diff --git a/programming_examples/average_pool/average_pool.py b/programming_examples/average_pool/average_pool.py index 2fd19c7e7..a55f6220b 100644 --- a/programming_examples/average_pool/average_pool.py +++ b/programming_examples/average_pool/average_pool.py @@ -12,11 +12,9 @@ Uses a 1x2 AIE herd with DMA transfers between L3 and L1 memory. """ -import argparse from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith from air.dialects.arith import ConstantOp @@ -29,8 +27,7 @@ ) from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu import numpy as np @@ -54,16 +51,8 @@ def build_module(m, n, tile_m, np_dtype_in): l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_m, n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - l1outputMemrefTy = MemRefType.get( - shape=[tile_m, 1], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_m, n], xrt_dtype_in) + l1outputMemrefTy = l1_memref_type([tile_m, 1], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy) def average_pool(arg0, arg2): @@ -85,20 +74,7 @@ def herd_body( for _l_ivx in range_(0, m, tile_m * num_tiles): - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_m), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_m) dma_memcpy_nd( l1_a_data, @@ -151,16 +127,16 @@ def herd_body( ) cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) v_a = transfer_read( - VectorType.get([n], xrt_dtype_in), + vec_type(n, xrt_dtype_in), collapse_a, [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), + identity_map_attr(), cst0, [True], ) # Multiply by 1/N before reduction to avoid scalar bf16 # multiply which can produce corrupted output on AIE2. - v_inv_n = broadcast(VectorType.get([n], xrt_dtype_in), inv_n) + v_inv_n = broadcast(vec_type(n, xrt_dtype_in), inv_n) v_scaled = arith.mulf(v_a, v_inv_n) v_avg = reduction(xrt_dtype_in, CombiningKind.ADD, v_scaled) store(v_avg, collapse_c, [c0]) @@ -188,20 +164,7 @@ def herd_body( TILE_M = 256 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the AveragePool example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the AveragePool example") parser.add_argument( "--m", type=int, @@ -215,20 +178,6 @@ def herd_body( help="Input size (dimension N, pool width)", ) parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() @@ -246,46 +195,29 @@ def herd_body( args.m, args.n ) - if args.compile_mode == "compile-and-run": - - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - - # AveragePool reference: sum of (each element * 1/N) per row - inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n) - sampled_values = np.array( - [np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - sampled_data = { - "shape": (args.m,), - "indices": sampled_indices, - "values": sampled_values, - } + # AveragePool reference: sum of (each element * 1/N) per row + inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n) + sampled_values = np.array( + [np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = { + "shape": (args.m,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="average_pool", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/axpy/axpy.py b/programming_examples/axpy/axpy.py index 27d925047..d5d25a5b8 100644 --- a/programming_examples/axpy/axpy.py +++ b/programming_examples/axpy/axpy.py @@ -13,20 +13,23 @@ with configurable VECTOR_SIZE (default 16). """ -import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp, fma +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp, fma from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu +from utils import vec_read, vec_write import numpy as np @@ -44,18 +47,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16): VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get([n], xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) + vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_attr() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def axpy(arg0, arg1, arg2): @@ -80,21 +75,7 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_x_data, @@ -121,29 +102,11 @@ def herd_body( v_a = BroadcastOp(vecTy, a_const) for j in range_(c0, cTileN, cVecSize): - sub_x = subview( - l1_x_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_y = subview( - l1_y_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_out = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - v_x = transfer_read(vecTy, sub_x, [c0], identity_map, cst0, [True]) - v_y = transfer_read(vecTy, sub_y, [c0], identity_map, cst0, [True]) + v_x = vec_read(l1_x_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_y = vec_read(l1_y_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # a * x + y via vector.fma v_result = fma(v_a, v_x, v_y) - transfer_write(None, v_result, sub_out, [c0], identity_map, [True]) + vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) # Write result from l1_out back to L3 output buffer @@ -167,12 +130,7 @@ def herd_body( INPUT_DATATYPE = bfloat16 ALPHA = 2.0 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the AXPY example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the AXPY example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( @@ -184,20 +142,6 @@ def herd_body( default=16, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() @@ -211,41 +155,24 @@ def herd_body( input_x = np.random.randn(args.n).astype(INPUT_DATATYPE) input_y = np.random.randn(args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - sampled_values = np.array( - [(args.alpha * input_x[i] + input_y[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_indices = np.vstack([np.random.randint(0, args.n, 100)]) + sampled_values = np.array( + [args.alpha * input_x[i] + input_y[i] for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_x, input_y], instance_name="axpy", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_x, input_y], - stochastic_expected_outputs=[sampled_data], - rtol=1e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/bottleneck/bottleneck.py b/programming_examples/bottleneck/bottleneck.py index 5762cdfae..0a7eeba45 100644 --- a/programming_examples/bottleneck/bottleneck.py +++ b/programming_examples/bottleneck/bottleneck.py @@ -42,7 +42,6 @@ enabling zero-copy data transfer between neighboring cores. """ -import argparse import numpy as np from air.ir import * @@ -53,8 +52,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -105,62 +103,38 @@ def build_module(): l3_wts_ty = MemRefType.get((TOTAL_WEIGHTS,), i8) l3_act_out_ty = MemRefType.get((ACTIVATIONS_OUT,), i8) - # L2 memory space - l2_mem_space = IntegerAttr.get(i32, MemorySpace.L2) - - # L1 memory space - l1_mem_space = IntegerAttr.get(i32, MemorySpace.L1) - # Per-row tile types (processing one row at a time for depth-first dataflow) # Layer 1 input: one row of 32 pixels with 256 input channels - l1_layer1_in_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8, memory_space=l1_mem_space - ) - l1_wts_layer1_ty = MemRefType.get((WEIGHTS_L1_SZ,), i8, memory_space=l1_mem_space) - l1_layer1_out_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L1_OUT_C), i8, memory_space=l1_mem_space - ) + l1_layer1_in_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8) + l1_wts_layer1_ty = l1_memref_type((WEIGHTS_L1_SZ,), i8) + l1_layer1_out_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L1_OUT_C), i8) # Layer 2 (3x3 conv) types - l1_layer2_in_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L2_IN_C), i8, memory_space=l1_mem_space - ) + l1_layer2_in_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L2_IN_C), i8) # L1 weights for layer 2 (36KB fits in AIE2's 64KB L1) - l1_wts_layer2_ty = MemRefType.get((WEIGHTS_L2_SZ,), i8, memory_space=l1_mem_space) + l1_wts_layer2_ty = l1_memref_type((WEIGHTS_L2_SZ,), i8) # Each 3x3 core produces half the output channels - l1_layer2_out_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L2_OUT_C // 2), i8, memory_space=l1_mem_space - ) + l1_layer2_out_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L2_OUT_C // 2), i8) # Combined output buffer for both 3x3 conv cores (shared L1, flat 1D) # Core 0 writes first 1024 bytes, Core 1 writes next 1024 bytes CONV3X3_OUT_HALF_SIZE = TENSOR_IN_W * 1 * (TENSOR_L2_OUT_C // 2) # 1024 - l1_layer2_out_combined_ty = MemRefType.get( - (CONV3X3_OUT_HALF_SIZE * 2,), i8, memory_space=l1_mem_space - ) + l1_layer2_out_combined_ty = l1_memref_type((CONV3X3_OUT_HALF_SIZE * 2,), i8) # Layer 3 (1x1 conv + skip) types - l1_layer3_in_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L3_IN_C // 2), i8, memory_space=l1_mem_space - ) - l1_wts_layer3_ty = MemRefType.get((WEIGHTS_L3_SZ,), i8, memory_space=l1_mem_space) - l1_layer3_out_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8, memory_space=l1_mem_space - ) + l1_layer3_in_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L3_IN_C // 2), i8) + l1_wts_layer3_ty = l1_memref_type((WEIGHTS_L3_SZ,), i8) + l1_layer3_out_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8) # L2 buffer types for skip connection - l2_skip_buf_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8, memory_space=l2_mem_space - ) + l2_skip_buf_ty = l2_memref_type((TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8) # L2 buffer type for output - l2_out_buf_ty = MemRefType.get( - (TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8, memory_space=l2_mem_space - ) + l2_out_buf_ty = l2_memref_type((TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8) # L2 buffer types for weight staging - l2_wts_layer1_ty = MemRefType.get((WEIGHTS_L1_SZ,), i8, memory_space=l2_mem_space) - l2_wts_layer2_ty = MemRefType.get((WEIGHTS_L2_SZ,), i8, memory_space=l2_mem_space) - l2_wts_layer3_ty = MemRefType.get((WEIGHTS_L3_SZ,), i8, memory_space=l2_mem_space) + l2_wts_layer1_ty = l2_memref_type((WEIGHTS_L1_SZ,), i8) + l2_wts_layer2_ty = l2_memref_type((WEIGHTS_L2_SZ,), i8) + l2_wts_layer3_ty = l2_memref_type((WEIGHTS_L3_SZ,), i8) # Declare external convolution kernel functions # These would be linked from compiled convolution kernels @@ -951,29 +925,7 @@ def compute_golden_reference( if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="bottleneck.py", - description="Builds, runs, and tests the bottleneck block example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - help="Print MLIR IR and exit", - ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure whether to run after compile", - ) + parser = make_air_parser("Builds, runs, and tests the bottleneck block example") parser.add_argument( "--debug-ir", action="store_true", @@ -1059,52 +1011,31 @@ def compute_golden_reference( print(f"Expected output shape: {expected_out.shape}") print("\nRunning AIR bottleneck design...") - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - debug_ir=args.debug_ir, - omit_pingpong="all", # Disable all ping-pong to avoid shared buffer sync issues, - runtime_loop_tiling_sizes=[4, 4], - ) - - # Custom comparison with scale factor tolerance - def compare_with_tolerance(actual, expected): - """Compare outputs with tolerance based on quantization scale.""" - actual_scaled = actual.astype(np.float32) * inp_scale4 - expected_scaled = expected.astype(np.float32) * inp_scale4 - - if np.allclose(actual_scaled, expected_scaled, rtol=0, atol=inp_scale4): - print("\n✓ PASS: Output matches golden reference!") - return True - else: - diff = np.abs(actual_scaled - expected_scaled) - print(f"\n✗ FAIL: Output mismatch") - print(f" Max difference: {diff.max():.4f}") - print(f" Mean difference: {diff.mean():.4f}") - print( - f" Mismatched elements: {np.sum(diff > inp_scale4)} / {len(diff)}" - ) - return False - exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_act_flat, total_wts], + instance_name="bottleneck_block", expected_outputs=[expected_out], rtol=0, atol=1, # Allow 1 unit of quantization error + runtime_loop_tiling_sizes=[4, 4], + debug_ir=args.debug_ir, + omit_pingpong="all", # Disable all ping-pong to avoid shared buffer sync issues ) ) elif args.compile_mode == "compile-only": print("\nCompiling AIR bottleneck design (no execution)...") - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - debug_ir=args.debug_ir, - omit_pingpong="all", # Disable all ping-pong to avoid shared buffer sync issues, - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="bottleneck_block", + runtime_loop_tiling_sizes=[4, 4], + debug_ir=args.debug_ir, + omit_pingpong="all", # Disable all ping-pong to avoid shared buffer sync issues + ) ) - module_function = backend.compile(mlir_module) - backend.unload() - print("Compilation successful!") diff --git a/programming_examples/bottleneck/bottleneck_mlir.py b/programming_examples/bottleneck/bottleneck_mlir.py index b1a2bcd83..e62a456d5 100644 --- a/programming_examples/bottleneck/bottleneck_mlir.py +++ b/programming_examples/bottleneck/bottleneck_mlir.py @@ -57,8 +57,8 @@ from air.dialects import scf from air.dialects.scf import for_, yield_ from air.dialects import vector as vector_dialect -from air.backend.xrt_runner import XRTRunner -from air.backend.xrt import XRTBackend +import aie.utils +from air.backend.xrt import compile_air, get_air_runtime range_ = for_ @@ -1542,27 +1542,31 @@ def compare_with_tolerance(actual, expected): return False # Compile and run directly to get actual outputs for custom comparison - # (XRTRunner._check_outputs uses exact match for integers, but AIE2P - # SRS positive_inf rounding can differ by 1 from Python's rounding) + # (run_test uses atol check, but AIE2P SRS positive_inf rounding can + # differ by 1 from Python's rounding, so we use a custom comparison.) import filelock - backend = XRTBackend( + npu_kernel = compile_air( + mlir_module, verbose=args.verbose, - omit_while_true_loop=False, + output_format=args.output_format, debug_ir=args.debug_ir, omit_pingpong="all", runtime_loop_tiling_sizes=[4, 4], + instance_name="bottleneck_block", ) output_placeholder = np.zeros(expected_out.shape, expected_out.dtype) - expanded_inputs = [input_act_flat, total_wts, output_placeholder] - - compiled_module = backend.compile(mlir_module) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_act_flat), + aie.utils.tensor(total_wts), + aie.utils.tensor(output_placeholder), + ] + handle = runtime.load(npu_kernel) with filelock.FileLock("/tmp/npu.lock"): - module_function = backend.load(compiled_module) - actual_outputs = module_function(*expanded_inputs) - backend.unload() + runtime.run(handle, io_args) - actual_out = actual_outputs[len([input_act_flat, total_wts])] + actual_out = io_args[len([input_act_flat, total_wts])].numpy() if compare_with_tolerance(actual_out, expected_out): print("PASS!") @@ -1572,13 +1576,13 @@ def compare_with_tolerance(actual, expected): elif args.compile_mode == "compile-only": print("\nCompiling AIR bottleneck design (no execution)...") - backend = XRTBackend( + compile_air( + mlir_module, verbose=args.verbose, - omit_while_true_loop=False, + output_format=args.output_format, debug_ir=args.debug_ir, - omit_pingpong="all", # Disable all ping-pong to avoid shared buffer sync issues, + omit_pingpong="all", runtime_loop_tiling_sizes=[4, 4], + instance_name="bottleneck_block", ) - module_function = backend.compile(mlir_module) - backend.unload() print("Compilation successful!") diff --git a/programming_examples/cascade_reduction/cascade_reduction.py b/programming_examples/cascade_reduction/cascade_reduction.py index 10cb7ff92..089bad3ca 100644 --- a/programming_examples/cascade_reduction/cascade_reduction.py +++ b/programming_examples/cascade_reduction/cascade_reduction.py @@ -16,16 +16,13 @@ Final result: output = input + 4 """ -import argparse - from air.ir import * from air.dialects.air import * from air.dialects import arith, linalg, memref, scf from air.dialects.memref import AllocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu import numpy as np @@ -45,11 +42,7 @@ def build_module(): # L3 types l3MemrefTy = MemRefType.get(data_shape, xrt_dtype) # L1 types - l1MemrefTy = MemRefType.get( - data_shape, - xrt_dtype, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type(data_shape, xrt_dtype) # Channels: chan_in/chan_out use DMA (L3<->L1), chan_cascade uses # direct core-to-core cascade connections between adjacent tiles. @@ -125,26 +118,7 @@ def herd_body(tx, ty, sx, sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the cascade reduction example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) + parser = make_air_parser("Builds, runs, and tests the cascade reduction example") args = parser.parse_args() mlir_module = build_module() @@ -154,48 +128,32 @@ def herd_body(tx, ty, sx, sy): input_a = np.arange(0, DATA_SIZE, dtype=np.int32).reshape(1, 1, DATA_SIZE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack( - [ - np.zeros(num_samples, dtype=int), - np.zeros(num_samples, dtype=int), - np.random.randint(0, DATA_SIZE, num_samples), - ] - ) - - sampled_values = np.array( - [input_a[i, j, k] + NUM_TILES for i, j, k in zip(*sampled_indices)], - dtype=np.int32, - ) - - sampled_data = { - "shape": (1, 1, DATA_SIZE), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="cascade_reduce", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + num_samples = 100 + sampled_indices = np.vstack( + [ + np.zeros(num_samples, dtype=int), + np.zeros(num_samples, dtype=int), + np.random.randint(0, DATA_SIZE, num_samples), + ] + ) + + sampled_values = np.array( + [input_a[i, j, k] + NUM_TILES for i, j, k in zip(*sampled_indices)], + dtype=np.int32, + ) + + sampled_data = { + "shape": (1, 1, DATA_SIZE), + "indices": sampled_indices, + "values": sampled_values, + } + + run_on_npu( + args, + mlir_module, + inputs=[input_a], + stochastic_expected_outputs=[sampled_data], + instance_name="cascade_reduce", + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + ) diff --git a/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py b/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py index c79d4d63b..e77de9daa 100644 --- a/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py +++ b/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -26,12 +25,7 @@ def build_module(): xrt_dtype = type_mapper(INOUT_DATATYPE) memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype) - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - image_type_l1 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype) Channel("ChanIn", size=[1, 1], broadcast_shape=[3, 1]) for name in OUTPUT_HERD_NAMES: @@ -87,27 +81,8 @@ def herd_body(_tx, _ty, _sx, _sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the channel broadcast multi herd example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", + parser = make_air_parser( + "Builds, runs, and tests the channel broadcast multi herd example" ) args = parser.parse_args() @@ -128,16 +103,12 @@ def herd_body(_tx, _ty, _sx, _sy): IMAGE_SIZE ) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a], + instance_name="copy", expected_outputs=[output_b, output_c, output_d], ) ) diff --git a/programming_examples/channel_examples/broadcast/single_herd/broadcast.py b/programming_examples/channel_examples/broadcast/single_herd/broadcast.py index 381db1c70..c7eab91e6 100644 --- a/programming_examples/channel_examples/broadcast/single_herd/broadcast.py +++ b/programming_examples/channel_examples/broadcast/single_herd/broadcast.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -24,12 +23,7 @@ def build_module(): xrt_dtype = type_mapper(INOUT_DATATYPE) memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype) - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - image_type_l1 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype) Channel("ChanIn", size=[1, 1], broadcast_shape=[1, 3]) Channel("ChanOut", size=[1, 3]) @@ -81,27 +75,8 @@ def herd_body(tx, ty, _sx, _sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the channel broadcast multi herd example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", + parser = make_air_parser( + "Builds, runs, and tests the channel broadcast multi herd example" ) args = parser.parse_args() @@ -122,16 +97,12 @@ def herd_body(tx, ty, _sx, _sy): IMAGE_SIZE ) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a], + instance_name="copy", expected_outputs=[output_b, output_c, output_d], ) ) diff --git a/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py b/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py index a31182df5..c926baae1 100644 --- a/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py +++ b/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py @@ -18,7 +18,6 @@ # The net effect is equivalent to a non-broadcast scatter, but implemented # over a single broadcast channel to conserve DMA channels. -import argparse import numpy as np from air.ir import * @@ -27,7 +26,7 @@ from air.dialects.func import FuncOp from air.dialects import arith, scf from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -45,12 +44,7 @@ def build_module(): memrefTyIn = MemRefType.get([total_size], xrt_dtype) memrefTyOut = MemRefType.get([total_size], xrt_dtype) - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - tile_type_l1 = MemRefType.get( - shape=[TILE_SIZE], - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + tile_type_l1 = l1_memref_type([TILE_SIZE], xrt_dtype) # Broadcast channel: size [1, 1] broadcast to [1, NUM_TILES] # All cores in the herd receive the same data on each put. @@ -132,27 +126,8 @@ def herd_body(tx, ty, _sx, _sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the broadcast selective capture example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", + parser = make_air_parser( + "Builds, runs, and tests the broadcast selective capture example" ) args = parser.parse_args() @@ -175,16 +150,12 @@ def herd_body(tx, ty, _sx, _sy): end = start + TILE_SIZE expected_output[start:end] = input_a[start:end] + ty - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="broadcast_selective_capture", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a], + instance_name="broadcast_selective_capture", expected_outputs=[expected_output], ) ) diff --git a/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py b/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py index 2581f1652..d42ea2b5c 100644 --- a/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py +++ b/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py @@ -23,16 +23,13 @@ output[seg, ty] = sum_{tx=0}^{3} input[seg, tx, ty] """ -import argparse - from air.ir import * from air.dialects.air import * from air.dialects import arith, linalg, scf from air.dialects.memref import AllocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu import numpy as np @@ -59,11 +56,7 @@ def build_module(): l3MemrefTyOut = MemRefType.get([TOTAL_OUT], xrt_dtype) # L1 type: one tile per core - l1MemrefTy = MemRefType.get( - tile_shape, - xrt_dtype, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type(tile_shape, xrt_dtype) # 3D input channel: [NUM_SEGMENTS, NUM_TILES, NUM_COLS]. # Each core (seg, tx, ty) gets its own unique tile. @@ -193,18 +186,8 @@ def herd_body(tx, ty, sx, sy, herd_seg_x): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the 3D channel with segment unroll example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", + parser = make_air_parser( + "Builds, runs, and tests the 3D channel with segment unroll example" ) args = parser.parse_args() @@ -231,17 +214,12 @@ def herd_body(tx, ty, sx, sy, herd_seg_x): in_start : in_start + DATA_SIZE ] - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="channel_3d_segment_unroll", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a], + instance_name="channel_3d_segment_unroll", expected_outputs=[expected_output], ) ) diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py index 0741f223d..e25ff299c 100644 --- a/programming_examples/channel_examples/channel_size/channel_size.py +++ b/programming_examples/channel_examples/channel_size/channel_size.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np np.random.seed(42) @@ -10,7 +9,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -86,15 +85,8 @@ def segment_body(): ) def herd_body(th, tw, _sx, _sy): - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - tile_type = MemRefType.get( - shape=TILE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space, - ) + tile_type = l1_memref_type(TILE_SIZE, xrt_dtype) # We must allocate a buffer of tile size for the input/output tile_in = AllocOp(tile_type, [], []) @@ -123,28 +115,7 @@ def herd_body(th, tw, _sx, _sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the channel_size example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) + parser = make_air_parser("Builds, runs, and tests the channel_size example") args = parser.parse_args() @@ -161,14 +132,12 @@ def herd_body(th, tw, _sx, _sy): ) output_matrix = input_matrix.copy() - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( - mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix] + run_on_npu( + args, + mlir_module, + inputs=[input_matrix], + instance_name="copy", + expected_outputs=[output_matrix], ) ) diff --git a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py index bb46f20a5..ef5b21bbb 100644 --- a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py +++ b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -24,15 +23,8 @@ def build_module(): xrt_dtype = type_mapper(INOUT_DATATYPE) memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype) - # We want to store our data in L1 memory - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - image_type_l1 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype) # Create two channels which will send/receive the # input/output data respectively @@ -120,28 +112,7 @@ def herd_body(tx, ty, sx, sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the herd_to_herd channel example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) + parser = make_air_parser("Builds, runs, and tests the herd_to_herd channel example") args = parser.parse_args() @@ -153,10 +124,12 @@ def herd_body(tx, ty, sx, sy): input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE) output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="copy", + expected_outputs=[output_b], + ) ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py index 65c77f0ac..0ea651a79 100644 --- a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py +++ b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -9,7 +8,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -41,15 +40,8 @@ def build_module(): xrt_dtype = type_mapper(INOUT_DATATYPE) memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype) - # We want to store our data in L1 memory - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - image_type_l1 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype) # Create two channels which will send/receive the # input/output data respectively @@ -128,28 +120,7 @@ def herd_body(tx, ty, sx, sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the herd_to_herd channel example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) + parser = make_air_parser("Builds, runs, and tests the herd_to_herd channel example") args = parser.parse_args() @@ -161,10 +132,12 @@ def herd_body(tx, ty, sx, sy): input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE) output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="copy", + expected_outputs=[output_b], + ) ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/channel_examples/hierarchical/hierarchical.py b/programming_examples/channel_examples/hierarchical/hierarchical.py index 23752159b..380b975dd 100644 --- a/programming_examples/channel_examples/hierarchical/hierarchical.py +++ b/programming_examples/channel_examples/hierarchical/hierarchical.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -24,19 +23,8 @@ def build_module(): xrt_dtype = type_mapper(INOUT_DATATYPE) memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype) - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2) - - image_type_l1 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) - image_type_l2 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l2, - ) + image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype) + image_type_l2 = l2_memref_type(IMAGE_SIZE, xrt_dtype) Channel("ChanInL2") Channel("ChanOutL2") @@ -96,28 +84,7 @@ def herd_body(tx, ty, sx, sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the channel hierarchical example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) + parser = make_air_parser("Builds, runs, and tests the channel hierarchical example") args = parser.parse_args() @@ -133,14 +100,12 @@ def herd_body(tx, ty, sx, sy): IMAGE_SIZE ) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( - mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix] + run_on_npu( + args, + mlir_module, + inputs=[input_matrix], + instance_name="copy", + expected_outputs=[output_matrix], ) ) diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py index 196cf6a88..847a57f17 100644 --- a/programming_examples/channel_examples/worker_to_self/worker_to_self.py +++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -29,19 +28,8 @@ def build_module(): Channel("ChanOut") Channel("ToSelf") - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - image_type_l1 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) - - mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2) - image_type_l2 = MemRefType.get( - shape=IMAGE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l2, - ) + image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype) + image_type_l2 = l2_memref_type(IMAGE_SIZE, xrt_dtype) # We will send an image worth of data in and out @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut) @@ -97,27 +85,8 @@ def herd_body(tx, ty, sx, sy, tensor_in_l2): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the channel worker_to_self example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", + parser = make_air_parser( + "Builds, runs, and tests the channel worker_to_self example" ) args = parser.parse_args() @@ -132,14 +101,12 @@ def herd_body(tx, ty, sx, sy, tensor_in_l2): ) output_matrix = input_matrix.copy() - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( - mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix] + run_on_npu( + args, + mlir_module, + inputs=[input_matrix], + instance_name="copy", + expected_outputs=[output_matrix], ) ) diff --git a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py index e49131fc7..6a756b310 100644 --- a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py +++ b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -9,7 +8,7 @@ from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ from air.dialects.affine import apply as affine_apply -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -138,15 +137,8 @@ def herd_body(th, tw, sh, sw): ) th_next = affine_apply(get_tile_height_next, [tw, sw, th, sh]) - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - tile_type = MemRefType.get( - shape=TILE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space, - ) + tile_type = l1_memref_type(TILE_SIZE, xrt_dtype) # We must allocate a buffer of tile size for the input/output tile_in = AllocOp(tile_type, [], []) @@ -195,27 +187,8 @@ def herd_body(th, tw, sh, sw): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the channel worker_to_worker example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", + parser = make_air_parser( + "Builds, runs, and tests the channel worker_to_worker example" ) args = parser.parse_args() @@ -257,14 +230,12 @@ def get_next_tile_num(tile_height, tile_width): input_matrix[i, j] + tile_num_map[(i // TILE_HEIGHT, j // TILE_WIDTH)] ) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( - mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix] + run_on_npu( + args, + mlir_module, + inputs=[input_matrix], + instance_name="copy", + expected_outputs=[output_matrix], ) ) diff --git a/programming_examples/conditional_branching/single_core.py b/programming_examples/conditional_branching/single_core.py index 5645c2986..1aa79eabf 100644 --- a/programming_examples/conditional_branching/single_core.py +++ b/programming_examples/conditional_branching/single_core.py @@ -1,6 +1,5 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse from math import cos, sin from air.ir import * @@ -10,8 +9,7 @@ from air.dialects.func import FuncOp, CallOp from air.dialects import scf from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -47,17 +45,8 @@ def segment_body( _l3_out_data, ): # L2 MemRefTypes - l2_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L2) - l2MemrefTyIn = MemRefType.get( - shape=[n], - element_type=xrt_dtype_in, - memory_space=l2_mem_space, - ) - l2MemrefTyOut = MemRefType.get( - shape=[n], - element_type=xrt_dtype_in, - memory_space=l2_mem_space, - ) + l2MemrefTyIn = l2_memref_type([n], xrt_dtype_in) + l2MemrefTyOut = l2_memref_type([n], xrt_dtype_in) l2_in_data = AllocOp(l2MemrefTyIn, [], []) l2_out_data = AllocOp(l2MemrefTyOut, [], []) dma_memcpy_nd( @@ -76,17 +65,8 @@ def herd_body_0( _tx, _ty, _sx, _sy, _l2_in_data, _l2_out_data, _param_arg ): - l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - l1MemrefTyIn = MemRefType.get( - shape=[n], - element_type=xrt_dtype_in, - memory_space=l1_mem_space, - ) - l1MemrefTyOut = MemRefType.get( - shape=[n], - element_type=xrt_dtype_in, - memory_space=l1_mem_space, - ) + l1MemrefTyIn = l1_memref_type([n], xrt_dtype_in) + l1MemrefTyOut = l1_memref_type([n], xrt_dtype_in) l1_in_data = AllocOp(l1MemrefTyIn, [], []) dma_memcpy_nd( @@ -141,34 +121,13 @@ def herd_body_0( INPUT_DATATYPE = np.int32 OUTPUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, default=N, help="N dimension size in a (1xK) * (KxN) matmul", ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -188,16 +147,13 @@ def herd_body_0( inputs = np.arange(0, args.n, dtype=INPUT_DATATYPE).reshape(args.n) outputs = inputs * 100 - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="conditional_branch", - runtime_loop_tiling_sizes=[4, 4], - ) - res0 = runner.run_test( + res0 = run_on_npu( + args, mlir_module, inputs=[inputs], + instance_name="conditional_branch", expected_outputs=[outputs], + runtime_loop_tiling_sizes=[4, 4], ) ###### Compile and test, param = 1 @@ -211,10 +167,13 @@ def herd_body_0( ) outputs = inputs + 100 - res1 = runner.run_test( + res1 = run_on_npu( + args, mlir_module, inputs=[inputs], + instance_name="conditional_branch", expected_outputs=[outputs], + runtime_loop_tiling_sizes=[4, 4], ) if res0 == 0 and res1 == 0: print("Both conditions PASS!") diff --git a/programming_examples/conv2d/conv2d.py b/programming_examples/conv2d/conv2d.py index af591c6a3..cdbab2b47 100644 --- a/programming_examples/conv2d/conv2d.py +++ b/programming_examples/conv2d/conv2d.py @@ -18,16 +18,14 @@ 3. DMA output tile from L1 to L3 """ -import argparse - +import numpy as np from air.ir import * from air.dialects.air import * from air.dialects import arith from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -56,10 +54,9 @@ def build_module(H, W, Ci, Co, Kh, Kw, np_dtype): # L1 types: drop the batch dimension (N=1) since we process one # sample. The DMA copies the full extent which matches because N=1. - l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - l1InTy = MemRefType.get([H, W, Ci], xrt_dtype, memory_space=l1_mem_space) - l1FilterTy = MemRefType.get([Kh, Kw, Ci, Co], xrt_dtype, memory_space=l1_mem_space) - l1OutTy = MemRefType.get([Ho, Wo, Co], xrt_dtype, memory_space=l1_mem_space) + l1InTy = l1_memref_type([H, W, Ci], xrt_dtype) + l1FilterTy = l1_memref_type([Kh, Kw, Ci, Co], xrt_dtype) + l1OutTy = l1_memref_type([Ho, Wo, Co], xrt_dtype) @FuncOp.from_py_func(l3InTy, l3FilterTy, l3OutTy) def conv2d(arg_in, arg_filter, arg_out): @@ -122,30 +119,11 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_filter, l3_out): if __name__ == "__main__": INPUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the 2D convolution example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the 2D convolution example") parser.add_argument("--H", type=int, default=H_DEFAULT, help="Input height") parser.add_argument("--W", type=int, default=W_DEFAULT, help="Input width") parser.add_argument("--Ci", type=int, default=CI_DEFAULT, help="Input channels") parser.add_argument("--Co", type=int, default=CO_DEFAULT, help="Output channels") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() Ho = args.H - KH + 1 @@ -177,28 +155,12 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_filter, l3_out): * filter_data[kh, kw, ci, co] ) - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="conv2d", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_data, filter_data], - expected_outputs=[output_ref], - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + run_on_npu( + args, + mlir_module, + inputs=[input_data, filter_data], + expected_outputs=[output_ref], + instance_name="conv2d", + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + ) diff --git a/programming_examples/data_transfer_transpose/channel/transpose.py b/programming_examples/data_transfer_transpose/channel/transpose.py index d27e74b88..1d3b2ae12 100644 --- a/programming_examples/data_transfer_transpose/channel/transpose.py +++ b/programming_examples/data_transfer_transpose/channel/transpose.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np np.random.seed(42) @@ -9,7 +8,7 @@ from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu dtype_map = { "uint32": np.uint32, @@ -43,15 +42,10 @@ def launch_body(a, b): def segment_body(): @herd(name="herd", sizes=[1, 1]) def herd_body(_tx, _ty, _sx, _sy): - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tensor - tensor_type = MemRefType.get( - shape=[k * m], # Read as one large array - element_type=xrt_dtype, - memory_space=mem_space, - ) + tensor_type = l1_memref_type( + [k * m], xrt_dtype + ) # Read as one large array # We must allocate a buffer of tile size for the input/output tensor_in = AllocOp(tensor_type, [], []) @@ -63,15 +57,8 @@ def herd_body(_tx, _ty, _sx, _sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example", - ) - - parser.add_argument( - "-v", - "--verbose", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the matrix_scalar_add/single_core_channel example" ) parser.add_argument( "-m", @@ -92,19 +79,6 @@ def herd_body(_tx, _ty, _sx, _sy): choices=dtype_map.keys(), help="The data type of the matrix", ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -136,16 +110,12 @@ def herd_body(_tx, _ty, _sx, _sy): ) expected_output_matrix = np.transpose(input_matrix) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="transpose", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_matrix], + instance_name="transpose", expected_outputs=[expected_output_matrix], ) ) diff --git a/programming_examples/data_transfer_transpose/dma/transpose.py b/programming_examples/data_transfer_transpose/dma/transpose.py index 094e4a0c2..b5c7a9ffd 100644 --- a/programming_examples/data_transfer_transpose/dma/transpose.py +++ b/programming_examples/data_transfer_transpose/dma/transpose.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np np.random.seed(42) @@ -9,7 +8,7 @@ from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu dtype_map = { "uint32": np.uint32, @@ -34,15 +33,10 @@ def launch_body(a, b): def segment_body(arg2, arg3): @herd(name="herd", sizes=[1, 1], operands=[arg2, arg3]) def herd_body(_tx, _ty, _sx, _sy, a, b): - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tensor - tensor_type = MemRefType.get( - shape=[m * k], # Read as one large array - element_type=xrt_dtype, - memory_space=mem_space, - ) + tensor_type = l1_memref_type( + [m * k], xrt_dtype + ) # Read as one large array # We must allocate a buffer of tile size for the input/output tensor_in = AllocOp(tensor_type, [], []) @@ -69,15 +63,8 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example", - ) - - parser.add_argument( - "-v", - "--verbose", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the matrix_scalar_add/single_core_channel example" ) parser.add_argument( "-m", @@ -98,19 +85,6 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): choices=dtype_map.keys(), help="The data type of the matrix", ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -142,16 +116,12 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): ) expected_output_matrix = np.transpose(input_matrix) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="transpose", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_matrix], + instance_name="transpose", expected_outputs=[expected_output_matrix], ) ) diff --git a/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py b/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py index cc4ef215c..b8ea421b1 100644 --- a/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py +++ b/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py @@ -12,7 +12,6 @@ perform the transpose. """ -import argparse import numpy as np from ml_dtypes import bfloat16 @@ -22,8 +21,7 @@ from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu INOUT_DATATYPE = bfloat16 @@ -35,12 +33,7 @@ def build_module(m, k): memrefTyIn = MemRefType.get(shape=[m * k], element_type=xrt_dtype) memrefTyOut = MemRefType.get(shape=[k * m], element_type=xrt_dtype) - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - l1_type = MemRefType.get( - shape=[m * k], - element_type=xrt_dtype, - memory_space=mem_space, - ) + l1_type = l1_memref_type([m * k], xrt_dtype) transpose_func = external_func("transpose_bf16", inputs=[l1_type, l1_type]) @@ -78,28 +71,9 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): M = 64 K = 32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the bf16 transpose example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the bf16 transpose example") parser.add_argument("-m", type=int, default=M, help="Matrix rows") parser.add_argument("-k", type=int, default=K, help="Matrix columns") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.m, args.k) @@ -110,25 +84,12 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): input_matrix = np.random.uniform(-1.0, 1.0, (args.m, args.k)).astype(INOUT_DATATYPE) expected_output = np.transpose(input_matrix) - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_matrix.reshape(-1)], instance_name="transpose", - runtime_loop_tiling_sizes=[4, 4], + expected_outputs=[expected_output.reshape(-1)], ) - exit( - runner.run_test( - mlir_module, - inputs=[input_matrix.reshape(-1)], - expected_outputs=[expected_output.reshape(-1)], - ) - ) - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/dequant_awq/dequant_awq.py b/programming_examples/dequant_awq/dequant_awq.py index 6e0c2a23a..fa215aaf6 100644 --- a/programming_examples/dequant_awq/dequant_awq.py +++ b/programming_examples/dequant_awq/dequant_awq.py @@ -21,8 +21,7 @@ from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, run_on_npu @module_builder @@ -149,29 +148,16 @@ def herd_body(_tx, _ty, _sx, _sy, hw, hp, ho): packed_i8 = packed_weights.view(np.int8) - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - omit_pingpong=True, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[packed_i8, params], instance_name="dequant", + expected_outputs=[ref_output], + rtol=1e-1, + atol=5e-2, runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[packed_i8, params], - expected_outputs=[ref_output], - rtol=1e-1, - atol=5e-2, - ) - ) - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, omit_pingpong=True, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/eltwise_add/eltwise_add.py b/programming_examples/eltwise_add/eltwise_add.py index 75b7398c4..f354e29de 100644 --- a/programming_examples/eltwise_add/eltwise_add.py +++ b/programming_examples/eltwise_add/eltwise_add.py @@ -11,7 +11,10 @@ configurable VECTOR_SIZE (default 16 for BF16, 8 for F32). """ -import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from ml_dtypes import bfloat16 @@ -24,8 +27,7 @@ from air.dialects.vector import transfer_read, transfer_write from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu import numpy as np @@ -51,15 +53,8 @@ def build_module( n % (tile_n * total_tiles) == 0 ), f"n ({n}) must be divisible by tile_n*total_tiles ({tile_n}*{total_tiles}={tile_n*total_tiles})" - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) # Vectorization setup vectorize = vector_size > 0 @@ -67,8 +62,8 @@ def build_module( assert ( tile_n % vector_size == 0 ), f"tile_n ({tile_n}) must be divisible by vector_size ({vector_size})" - vecTy = VectorType.get([vector_size], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + vecTy = vec_type(vector_size, xrt_dtype_in) + imap = identity_map_attr() index_type = IndexType.get() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) @@ -147,14 +142,10 @@ def herd_body( sub_a = subview(l1_a_data.result, [j], [vector_size], [1]) sub_b = subview(l1_b_data.result, [j], [vector_size], [1]) sub_c = subview(l1_out_data.result, [j], [vector_size], [1]) - v_a = transfer_read( - vecTy, sub_a, [c0], identity_map, cst0, [True] - ) - v_b = transfer_read( - vecTy, sub_b, [c0], identity_map, cst0, [True] - ) + v_a = transfer_read(vecTy, sub_a, [c0], imap, cst0, [True]) + v_b = transfer_read(vecTy, sub_b, [c0], imap, cst0, [True]) v_c = arith.AddFOp(v_a, v_b) - transfer_write(None, v_c, sub_c, [c0], identity_map, [True]) + transfer_write(None, v_c, sub_c, [c0], imap, [True]) yield_([]) else: # Scalar compute loop (original) @@ -190,20 +181,7 @@ def herd_body( VECTOR_SIZE = 16 NUM_TILES = 2 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the eltwise_add example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the eltwise_add example") parser.add_argument( "--n", type=int, @@ -242,22 +220,6 @@ def herd_body( default="bf16", help="Data type (default: bf16)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() if args.dtype == "bf16": @@ -281,57 +243,36 @@ def herd_body( input_a = np.random.uniform(0, 4, args.n).astype(INPUT_DATATYPE) input_b = np.random.uniform(0, 4, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) + # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), # i indices + ] + ) - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] + input_b[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) + # Compute reference results for sampled indices + sampled_values = np.array( + [input_a[i] + input_b[i] for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) - # Store as a dictionary - sampled_data = { - "shape": (args.n), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + # Store as a dictionary + sampled_data = { + "shape": (args.n), + "indices": sampled_indices, + "values": sampled_values, + } + + # BF16 has ~0.8% relative precision; use looser tolerance + rtol = 0.01 if INPUT_DATATYPE == bfloat16 else 1e-3 + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], instance_name="eltwise_add", - runtime_loop_tiling_sizes=[4, 4], - ) - # BF16 has ~0.8% relative precision; use looser tolerance - rtol = 0.01 if INPUT_DATATYPE == bfloat16 else 1e-3 - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=rtol, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - omit_auto_broadcast=True, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=rtol, ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/eltwise_add_with_l2/eltwise_add.py b/programming_examples/eltwise_add_with_l2/eltwise_add.py index 8b87ae7da..9fc393cf6 100644 --- a/programming_examples/eltwise_add_with_l2/eltwise_add.py +++ b/programming_examples/eltwise_add_with_l2/eltwise_add.py @@ -1,17 +1,14 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu import numpy as np @@ -33,18 +30,10 @@ def build_module(n, tile_n, np_dtype_in): l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) # L2 MemRefTypes - l2MemrefTy = MemRefType.get( - shape=a_size, - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L2), - ) + l2MemrefTy = l2_memref_type(a_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def eltwise_add(arg0, arg1, arg2): @@ -102,20 +91,7 @@ def herd_body( for _l_ivx in range_(0, n, tile_n * num_tiles): - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, @@ -171,20 +147,7 @@ def herd_body( TILE_N = 1024 INPUT_DATATYPE = np.float32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, @@ -192,22 +155,6 @@ def herd_body( help="Total number of elements", ) parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -225,55 +172,34 @@ def herd_body( input_b = np.arange(0, args.n, dtype=np.int64).reshape(args.n) input_b = input_b.astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] + input_b[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="eltwise_add", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=1e-3, - ) - ) + # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), # i indices + ] + ) - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - omit_auto_broadcast=True, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) + # Compute reference results for sampled indices + sampled_values = np.array( + [input_a[i] + input_b[i] for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) - backend.unload() + # Store as a dictionary + sampled_data = { + "shape": (args.n), + "indices": sampled_indices, + "values": sampled_values, + } + + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], + stochastic_expected_outputs=[sampled_data], + instance_name="eltwise_add", + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + rtol=1e-3, + ) diff --git a/programming_examples/ffn_swiglu/decode/ffn_decode.py b/programming_examples/ffn_swiglu/decode/ffn_decode.py index c96eb340c..615c179b7 100644 --- a/programming_examples/ffn_swiglu/decode/ffn_decode.py +++ b/programming_examples/ffn_swiglu/decode/ffn_decode.py @@ -30,8 +30,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -266,6 +265,13 @@ def h(_tx, _ty, _sx, _sy, hi, hw, ho): default="elf", dest="output_format", ) + parser.add_argument( + "--compile-mode", + type=str, + choices=["compile-only", "compile-and-run"], + dest="compile_mode", + default="compile-and-run", + ) args = parser.parse_args() dim = args.dim @@ -312,20 +318,16 @@ def pack_weights_partitioned(W, dim, dim_m, num_cols): intermediate = silu_gate * up ref_out = (W_down.astype(np.float32) @ intermediate).astype(INPUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - omit_pingpong=True, - output_format=args.output_format, - instance_name="ffn_swiglu", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[x, packed_weights, gate_buf, up_buf, inter_buf], + instance_name="ffn_swiglu", expected_outputs=[ref_out], rtol=1e0, atol=0.5, + runtime_loop_tiling_sizes=[4, 4], + omit_pingpong=True, ) ) diff --git a/programming_examples/ffn_swiglu/prefill/ffn_prefill.py b/programming_examples/ffn_swiglu/prefill/ffn_prefill.py index 5309485b7..393c5bade 100644 --- a/programming_examples/ffn_swiglu/prefill/ffn_prefill.py +++ b/programming_examples/ffn_swiglu/prefill/ffn_prefill.py @@ -30,8 +30,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -268,6 +267,13 @@ def h(_tx, _ty, _sx, _sy, hi, hw, ho): default="elf", dest="output_format", ) + parser.add_argument( + "--compile-mode", + type=str, + choices=["compile-only", "compile-and-run"], + dest="compile_mode", + default="compile-and-run", + ) args = parser.parse_args() seq_len = args.seq_len @@ -321,16 +327,9 @@ def pack_weights(W, dim, dim_n, num_cols): intermediate = silu_gate * up ref_out = (intermediate @ W_down.astype(np.float32).T).astype(INPUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - omit_pingpong=True, - runtime_loop_tiling_sizes=[4, 4], - output_format=args.output_format, - instance_name="ffn_swiglu", - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[ x.reshape(-1), @@ -339,8 +338,11 @@ def pack_weights(W, dim, dim_n, num_cols): up_buf, inter_buf, ], + instance_name="ffn_swiglu", expected_outputs=[ref_out.reshape(-1)], rtol=1e0, atol=0.5, + runtime_loop_tiling_sizes=[4, 4], + omit_pingpong=True, ) ) diff --git a/programming_examples/flash_attention/dataflow_based/attn.py b/programming_examples/flash_attention/dataflow_based/attn.py index d31f338c5..b69c27ebe 100644 --- a/programming_examples/flash_attention/dataflow_based/attn.py +++ b/programming_examples/flash_attention/dataflow_based/attn.py @@ -12,8 +12,7 @@ from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ from air.dialects import scf, affine, arith -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from ml_dtypes import bfloat16 range_ = for_ @@ -733,6 +732,13 @@ def herd_body_final(arg22, arg23, arg24, arg25, arg27, arg28, arg29): default="aie2", help="Target architecture (default: aie2)", ) + parser.add_argument( + "--compile-mode", + type=str, + choices=["compile-only", "compile-and-run"], + dest="compile_mode", + default="compile-and-run", + ) args = parser.parse_args() @@ -753,9 +759,6 @@ def herd_body_final(arg22, arg23, arg24, arg25, arg27, arg28, arg29): print(mlir_module) exit(0) - # Import XRT dependencies only when running tests - from air.backend.xrt_runner import XRTRunner, type_mapper - from air.backend.xrt import XRTBackend from air.extras import types as extrasT from ml_dtypes import bfloat16 @@ -800,19 +803,15 @@ def herd_body_final(arg22, arg23, arg24, arg25, arg27, arg28, arg29): lazy_attn_output = (Gp / sp).astype(OUTPUT_DATATYPE) - runner = XRTRunner( - omit_while_true_loop=False, - omit_pingpong=True, - verbose=False, - runtime_loop_tiling_sizes=[1, 1], - output_format=args.output_format, - instance_name="attention_bf16", - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_q_scaled, input_k, input_v, input_m], + instance_name="attention_bf16", expected_outputs=[lazy_attn_output], rtol=1e-1, + runtime_loop_tiling_sizes=[1, 1], + omit_pingpong=True, ) ) diff --git a/programming_examples/flash_attention/kernel_fusion_based/attn.py b/programming_examples/flash_attention/kernel_fusion_based/attn.py index 2b9882a9d..0552cf5a8 100644 --- a/programming_examples/flash_attention/kernel_fusion_based/attn.py +++ b/programming_examples/flash_attention/kernel_fusion_based/attn.py @@ -50,6 +50,7 @@ from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_ as scf_range, yield_ from air.dialects import scf, affine, arith +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu @module_builder @@ -1259,8 +1260,6 @@ def _emit_counter_increment(): print(mlir_module) exit(0) - from air.backend.xrt_runner import XRTRunner - from air.backend.xrt import XRTBackend from ml_dtypes import bfloat16 INPUT_DATATYPE = OUTPUT_DATATYPE = bfloat16 @@ -1305,37 +1304,19 @@ def _emit_counter_increment(): ) tiling = [1, 1, 1] if dv_chunks_host > 1 else [1, 1] - runner = XRTRunner( - omit_while_true_loop=False, - omit_pingpong="all", - verbose=args.verbose, - runtime_loop_tiling_sizes=tiling, - output_format=args.output_format, - instance_name="attention_bf16", - target_device="npu2", - ) - - if args.compile_mode == "compile-and-run": - exit( - runner.run_test( - mlir_module, - inputs=[input_q, input_k, input_v], - expected_outputs=[sdpa_output_transposed], - atol=0.15, - rtol=0.04, - max_mismatch_percentage=0.5, - min_correlation=0.99, - ) - ) - elif args.compile_mode == "compile-only": - backend = XRTBackend( - omit_while_true_loop=False, - omit_pingpong="all", - verbose=args.verbose, - runtime_loop_tiling_sizes=tiling, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_q, input_k, input_v], instance_name="attention_bf16", + expected_outputs=[sdpa_output_transposed], + atol=0.15, + rtol=0.04, + max_mismatch_percentage=0.5, + min_correlation=0.99, + runtime_loop_tiling_sizes=tiling, + omit_pingpong="all", target_device="npu2", ) - module_function = backend.compile(mlir_module) - print("Compilation complete.") + ) diff --git a/programming_examples/gelu/gelu.py b/programming_examples/gelu/gelu.py index fe33c239b..2c0bfb80b 100644 --- a/programming_examples/gelu/gelu.py +++ b/programming_examples/gelu/gelu.py @@ -15,21 +15,24 @@ Computation is vectorized using vector.transfer_read/write. """ -import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import numpy as np from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith, math as math_dialect from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu +from utils import vec_read, vec_write range_ = for_ @@ -47,14 +50,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16): index_type = IndexType.get() l3memrefTy = MemRefType.get([n], xrt_dtype_in) - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) + vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_attr() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def gelu(arg0, arg1): @@ -65,20 +63,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): l1_out = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_in, @@ -102,10 +87,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): v_s2opi = BroadcastOp(vecTy, s2opi_const) for j in range_(c0, cTileN, cVecSize): - sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1]) - sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1]) - - v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True]) + v_x = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) # Uses hardware tanh intrinsic — no exp or division needed. @@ -119,7 +101,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): v_half_x = arith.mulf(v_x, v_half.result) v_gelu = arith.mulf(v_half_x, v_one_plus_tanh) - transfer_write(None, v_gelu, sub_out, [c0], identity_map, [True]) + vec_write(v_gelu, l1_out, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -139,31 +121,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): TILE_N = 1024 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the GELU example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the GELU example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( "--vector-size", type=int, default=16, help="Vector size for SIMD operations" ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size) @@ -174,62 +137,42 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): np.random.seed(0) input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - - # Match hardware bf16 computation: each op truncates to bf16 - def gelu_ref(x): - x_bf = INPUT_DATATYPE(x) - x2 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x_bf)) - x3 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x2)) - beta_x3 = INPUT_DATATYPE( - np.float32(x3) * np.float32(INPUT_DATATYPE(GELU_BETA)) - ) - inner = INPUT_DATATYPE(np.float32(x_bf) + np.float32(beta_x3)) - scaled = INPUT_DATATYPE( - np.float32(inner) * np.float32(INPUT_DATATYPE(SQRT_2_OVER_PI)) - ) - tanh_val = INPUT_DATATYPE(np.tanh(np.float32(scaled))) - one_plus_tanh = INPUT_DATATYPE( - np.float32(tanh_val) + np.float32(INPUT_DATATYPE(1.0)) - ) - half_x = INPUT_DATATYPE(np.float32(x_bf) * np.float32(INPUT_DATATYPE(0.5))) - return INPUT_DATATYPE(np.float32(half_x) * np.float32(one_plus_tanh)) - - sampled_values = np.array( - [gelu_ref(input_a[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="gelu", - runtime_loop_tiling_sizes=[4, 4], + # Match hardware bf16 computation: each op truncates to bf16 + def gelu_ref(x): + x_bf = INPUT_DATATYPE(x) + x2 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x_bf)) + x3 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x2)) + beta_x3 = INPUT_DATATYPE(np.float32(x3) * np.float32(INPUT_DATATYPE(GELU_BETA))) + inner = INPUT_DATATYPE(np.float32(x_bf) + np.float32(beta_x3)) + scaled = INPUT_DATATYPE( + np.float32(inner) * np.float32(INPUT_DATATYPE(SQRT_2_OVER_PI)) ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - atol=5e-2, - ) + tanh_val = INPUT_DATATYPE(np.tanh(np.float32(scaled))) + one_plus_tanh = INPUT_DATATYPE( + np.float32(tanh_val) + np.float32(INPUT_DATATYPE(1.0)) ) + half_x = INPUT_DATATYPE(np.float32(x_bf) * np.float32(INPUT_DATATYPE(0.5))) + return INPUT_DATATYPE(np.float32(half_x) * np.float32(one_plus_tanh)) - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], + sampled_indices = np.vstack([np.random.randint(0, args.n, 100)]) + sampled_values = np.array( + [gelu_ref(input_a[i]) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="gelu", + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, + atol=5e-2, ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/herd_dataflow/run.py b/programming_examples/herd_dataflow/run.py index 177813e35..5dbaeb504 100644 --- a/programming_examples/herd_dataflow/run.py +++ b/programming_examples/herd_dataflow/run.py @@ -20,7 +20,7 @@ from air.dialects.scf import for_, yield_ from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt_runner import run_on_npu from ml_dtypes import bfloat16 # Constants for buffer sizes and loop bounds @@ -69,6 +69,13 @@ def parse_args(): dest="output_format", help="Output format for the compiled binary (default: xclbin)", ) + parser.add_argument( + "--compile-mode", + type=str, + choices=["compile-only", "compile-and-run"], + dest="compile_mode", + default="compile-and-run", + ) args = parser.parse_args() return args @@ -456,20 +463,15 @@ def main(): B = np.random.rand(M_SIZE, N_SIZE).astype(bfloat16) C = (A + B + 3.0).astype(bfloat16) - # Run the module using XRTRunner - runner = XRTRunner( - omit_while_true_loop=False, - verbose=False, - runtime_loop_tiling_sizes=[2, 2], - output_format=args.output_format, - instance_name="func1", - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[A, B], + instance_name="func1", expected_outputs=[C], rtol=1e-2, + runtime_loop_tiling_sizes=[2, 2], ) ) diff --git a/programming_examples/layer_norm/layer_norm.py b/programming_examples/layer_norm/layer_norm.py index 93ddeb010..a60d9e773 100644 --- a/programming_examples/layer_norm/layer_norm.py +++ b/programming_examples/layer_norm/layer_norm.py @@ -14,7 +14,7 @@ configurable VECTOR_SIZE (default 16 for AIE2). """ -import argparse +import numpy as np from ml_dtypes import bfloat16 from air.ir import * @@ -29,8 +29,7 @@ ) from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -44,16 +43,15 @@ def build_module(M, N, np_dtype, vector_size=16): N % vector_size == 0 ), f"N ({N}) must be divisible by vector_size ({vector_size})" - vecTy = VectorType.get([vector_size], xrt_dtype) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + vecTy = vec_type(vector_size, xrt_dtype) + identity_map = identity_map_attr() # L3 types l3MemrefTy = MemRefType.get([M, N], xrt_dtype) # L1 types - l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - l1RowTy = MemRefType.get([N], xrt_dtype, memory_space=l1_mem_space) - l1VecTy = MemRefType.get([vector_size], xrt_dtype, memory_space=l1_mem_space) + l1RowTy = l1_memref_type([N], xrt_dtype) + l1VecTy = l1_memref_type([vector_size], xrt_dtype) @FuncOp.from_py_func(l3MemrefTy, l3MemrefTy) def layer_norm(arg0, arg1): @@ -177,12 +175,7 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out): VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the layer normalization example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the layer normalization example") parser.add_argument("--M", type=int, default=M_DEFAULT, help="M dimension (rows)") parser.add_argument("--N", type=int, default=N_DEFAULT, help="N dimension (cols)") parser.add_argument( @@ -191,20 +184,6 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out): default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.M, args.N, INPUT_DATATYPE, args.vector_size) @@ -222,30 +201,14 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out): rstd = 1.0 / np.sqrt(variance + eps) y_expected = ((x_input - mean) * rstd).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[x_input], instance_name="layer_norm", - runtime_loop_tiling_sizes=[4, 4], + expected_outputs=[y_expected], + rtol=5e-2, + atol=5e-1, ) - exit( - runner.run_test( - mlir_module, - inputs=[x_input], - expected_outputs=[y_expected], - rtol=5e-2, - atol=5e-1, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/leaky_relu/leaky_relu.py b/programming_examples/leaky_relu/leaky_relu.py index 46fd4dadf..ca10f52bc 100644 --- a/programming_examples/leaky_relu/leaky_relu.py +++ b/programming_examples/leaky_relu/leaky_relu.py @@ -11,23 +11,26 @@ configurable VECTOR_SIZE (default 16). """ -import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import numpy as np np.random.seed(42) from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu +from utils import vec_read, vec_write range_ = for_ @@ -41,18 +44,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=0.01, vector_size=16): VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get([n], xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) + vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_attr() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def leaky_relu(arg0, arg1): @@ -73,21 +68,7 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_in_data, @@ -107,24 +88,12 @@ def herd_body( v_alpha = BroadcastOp(vecTy, alpha_const) for j in range_(c0, cTileN, cVecSize): - sub_in = subview( - l1_in_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_out = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True]) + v_x = vec_read(l1_in_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_alpha_x = arith.MulFOp(v_x, v_alpha) # Leaky RELU: x >= 0 ? x : alpha*x cmp = arith.CmpFOp(arith.CmpFPredicate.OGE, v_x, v_zero) v_result = arith.SelectOp(cmp, v_x, v_alpha_x) - transfer_write(None, v_result, sub_out, [c0], identity_map, [True]) + vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -146,12 +115,7 @@ def herd_body( INPUT_DATATYPE = bfloat16 ALPHA = 0.01 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the Leaky RELU example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the Leaky RELU example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( @@ -163,20 +127,6 @@ def herd_body( default=16, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() @@ -190,44 +140,27 @@ def herd_body( # Mix of positive and negative values for Leaky RELU testing input_a = np.random.randn(args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - sampled_values = np.array( - [ - np.where(input_a[i] >= 0, input_a[i], args.alpha * input_a[i]) - for i in zip(*sampled_indices) - ], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_indices = np.vstack([np.random.randint(0, args.n, 100)]) + sampled_values = np.array( + [ + np.where(input_a[i] >= 0, input_a[i], args.alpha * input_a[i]) + for i in zip(*sampled_indices) + ], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="leaky_relu", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index 7a7f86ec9..e3bf7e3ca 100644 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -85,8 +85,8 @@ ) result = result.stdout.decode("utf-8").split("\n") # Older format is "|[0000:41:00.1] ||RyzenAI-npu1 |" - # Newer format is "|[0000:41:00.1] |NPU Phoenix |" - p = re.compile(r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU (\w+))\W*\|") + # Newer format is "|[0000:41:00.1] |NPU Phoenix |" or "|[0000:c6:00.1] |NPU Strix Halo |" + p = re.compile(r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU ([\w ]+?))\s*\|") for l in result: m = p.match(l) if not m: @@ -94,9 +94,9 @@ print("Found Ryzen AI device:", m.group(1)) model = "unknown" if m.group(3): - model = str(m.group(3)) + model = str(m.group(3)).strip() if m.group(4): - model = str(m.group(4)) + model = str(m.group(4)).strip() print(f"\tmodel: '{model}'") config.available_features.add("ryzen_ai") run_on_npu = ( @@ -106,10 +106,10 @@ run_on_npu1 = run_on_npu config.available_features.add("ryzen_ai_npu1") print("Running tests on NPU1 with command line: ", run_on_npu) - elif model in ["npu4", "Strix"]: + elif "Strix" in model or model in ["npu4"]: run_on_npu2 = run_on_npu config.available_features.add("ryzen_ai_npu2") - print("Running tests on NPU4 with command line: ", run_on_2npu) + print("Running tests on NPU2 with command line: ", run_on_npu) else: print(f"WARNING: xrt-smi reported unknown NPU model '{model}'.") break diff --git a/programming_examples/llama2_mha/mha.py b/programming_examples/llama2_mha/mha.py index 3f2fa86c1..7dd406512 100644 --- a/programming_examples/llama2_mha/mha.py +++ b/programming_examples/llama2_mha/mha.py @@ -10,8 +10,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from ml_dtypes import bfloat16 range_ = for_ @@ -867,37 +866,16 @@ def herd_body_0(_tx, _ty, _sx, _sy, c_data, out_data, pos): for i in range(0, args.n): output_xb[i] += softmax_output[t] * output_vc[t][i] - if args.compile_mode == "compile-and-run": - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - omit_pingpong=True, - output_format=args.output_format, - instance_name="mha_bf16", + instance_name = args.instance_name if args.instance_name else "mha_bf16" + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b, output_kc, output_vc], + instance_name=instance_name, + expected_outputs=[output_xb], + rtol=1e0, runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b, output_kc, output_vc], - expected_outputs=[output_xb], - rtol=1e0, - ) - ) - - elif args.compile_mode == "compile-only": - ####### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, omit_pingpong=True, - kernel_name=args.kernel_name, - instance_name=args.instance_name, - kernel_id=args.kernel_id, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py index 1d1475d8e..4b2926a11 100644 --- a/programming_examples/matrix_multiplication/bf16/run.py +++ b/programming_examples/matrix_multiplication/bf16/run.py @@ -13,8 +13,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store, subview from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from air.extras import types as extrasT from air.dialects.linalg.opdsl.lang import * import air.dialects.linalg.opdsl.lang as linalg_lang @@ -716,8 +715,13 @@ def herd_body( input_a = (np.random.randn(args.m, args.k) * 4).astype(INPUT_DATATYPE) input_b = (np.random.rand(args.k, args.n) * 4).astype(INPUT_DATATYPE) + # Build common compile kwargs + compile_kwargs = {"runtime_loop_tiling_sizes": [2, 2]} + if not args.direct_codegen: + compile_kwargs["lower_linalg_to_func"] = "mm.o" + if args.compile_mode == "compile-and-run": - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. + # Stochastically sample num_sample results, and pass to run_on_npu for verification. num_samples = 100 sampled_indices = np.vstack( [ @@ -747,64 +751,35 @@ def herd_body( "values": sampled_values, } - ###### Compile and test - runner_kwargs = { - "verbose": args.verbose, - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - runner_kwargs["lower_linalg_to_func"] = "mm.o" - - runner = XRTRunner(**runner_kwargs, instance_name="matmul_bf16") exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="matmul_bf16", stochastic_expected_outputs=[sampled_data], rtol=0.05, atol=4, max_mismatch_percentage=5, + **compile_kwargs, ) ) - elif args.compile_mode == "compile-and-xclbin": - ###### Compile and generate xclbin (requires XRT, no execution) - backend_kwargs = { - "verbose": args.verbose, - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - backend_kwargs["lower_linalg_to_func"] = "mm.o" - - backend = XRTBackend(**backend_kwargs) - module_function = backend.compile(mlir_module) - - backend.unload() - - elif args.compile_mode == "compile-only": - ###### Compile only (without XRT dependencies) - # Map architecture to target device - target_device = "npu2" if args.arch == "aie2p" else "npu1" - - backend_kwargs = { - "verbose": args.verbose, - "target_device": target_device, # Explicit target based on arch (no xrt dependencies) - "output_format": "none", # Skip xclbin generation (no xrt dependencies) - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - backend_kwargs["lower_linalg_to_func"] = "mm.o" - - backend = XRTBackend(**backend_kwargs) - module_function = backend.compile(mlir_module) - - backend.unload() - - print("Compilation completed successfully!") - sys.exit(0) + elif args.compile_mode in ("compile-and-xclbin", "compile-only"): + ###### Compile only (no execution) + if args.compile_mode == "compile-only": + # Skip xclbin generation (no xrt dependencies) + target_device = "npu2" if args.arch == "aie2p" else "npu1" + compile_kwargs["target_device"] = target_device + compile_kwargs["output_format"] = "none" + # Remap to compile-only so run_on_npu dispatches correctly + args.compile_mode = "compile-only" + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="matmul_bf16", + **compile_kwargs, + ) + ) diff --git a/programming_examples/matrix_multiplication/i16/run.py b/programming_examples/matrix_multiplication/i16/run.py index 83567e8df..2f83a2810 100644 --- a/programming_examples/matrix_multiplication/i16/run.py +++ b/programming_examples/matrix_multiplication/i16/run.py @@ -12,8 +12,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store, subview from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from air.extras import types as extrasT from air.dialects.linalg.opdsl.lang import * import air.dialects.linalg.opdsl.lang as linalg_lang @@ -710,9 +709,14 @@ def herd_body( input_b = np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7 input_b = input_b.astype(INPUT_DATATYPE) + # Build common compile kwargs + compile_kwargs = {"runtime_loop_tiling_sizes": [2, 2]} + if not args.direct_codegen: + compile_kwargs["lower_linalg_to_func"] = "mm.o" + if args.compile_mode == "compile-and-run": - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. + # Stochastically sample num_sample results, and pass to run_on_npu for verification. num_samples = 100 sampled_indices = np.vstack( [ @@ -743,61 +747,32 @@ def herd_body( "values": sampled_values, } - ###### Compile and test - runner_kwargs = { - "verbose": args.verbose, - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - runner_kwargs["lower_linalg_to_func"] = "mm.o" - - runner = XRTRunner(**runner_kwargs, instance_name="matmul_bf16") exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="matmul_bf16", stochastic_expected_outputs=[sampled_data], + **compile_kwargs, ) ) - elif args.compile_mode == "compile-and-xclbin": - ###### Compile and generate xclbin (requires XRT, no execution) - backend_kwargs = { - "verbose": args.verbose, - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - backend_kwargs["lower_linalg_to_func"] = "mm.o" - - backend = XRTBackend(**backend_kwargs) - module_function = backend.compile(mlir_module) - - backend.unload() - - elif args.compile_mode == "compile-only": - ###### Compile only (without XRT dependencies) - # Map architecture to target device - target_device = "npu2" if args.arch == "aie2p" else "npu1" - - backend_kwargs = { - "verbose": args.verbose, - "target_device": target_device, # Explicit target based on arch (no xrt dependencies) - "output_format": "none", # Skip xclbin generation (no xrt dependencies) - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - backend_kwargs["lower_linalg_to_func"] = "mm.o" - - backend = XRTBackend(**backend_kwargs) - module_function = backend.compile(mlir_module) - - backend.unload() - - print("Compilation completed successfully!") - sys.exit(0) + elif args.compile_mode in ("compile-and-xclbin", "compile-only"): + ###### Compile only (no execution) + if args.compile_mode == "compile-only": + # Skip xclbin generation (no xrt dependencies) + target_device = "npu2" if args.arch == "aie2p" else "npu1" + compile_kwargs["target_device"] = target_device + compile_kwargs["output_format"] = "none" + # Remap to compile-only so run_on_npu dispatches correctly + args.compile_mode = "compile-only" + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="matmul_bf16", + **compile_kwargs, + ) + ) diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py index fd65144e1..fed6615de 100644 --- a/programming_examples/matrix_multiplication/i8/run.py +++ b/programming_examples/matrix_multiplication/i8/run.py @@ -12,8 +12,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store, subview from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from air.extras import types as extrasT from air.dialects.linalg.opdsl.lang import * import air.dialects.linalg.opdsl.lang as linalg_lang @@ -710,9 +709,14 @@ def herd_body( input_b = np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7 input_b = input_b.astype(INPUT_DATATYPE) + # Build common compile kwargs + compile_kwargs = {"runtime_loop_tiling_sizes": [2, 2]} + if not args.direct_codegen: + compile_kwargs["lower_linalg_to_func"] = "mm.o" + if args.compile_mode == "compile-and-run": - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. + # Stochastically sample num_sample results, and pass to run_on_npu for verification. num_samples = 100 sampled_indices = np.vstack( [ @@ -743,61 +747,32 @@ def herd_body( "values": sampled_values, } - ###### Compile and test - runner_kwargs = { - "verbose": args.verbose, - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - runner_kwargs["lower_linalg_to_func"] = "mm.o" - - runner = XRTRunner(**runner_kwargs, instance_name="matmul_bf16") exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="matmul_bf16", stochastic_expected_outputs=[sampled_data], + **compile_kwargs, ) ) - elif args.compile_mode == "compile-and-xclbin": - ###### Compile and generate xclbin (requires XRT, no execution) - backend_kwargs = { - "verbose": args.verbose, - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - backend_kwargs["lower_linalg_to_func"] = "mm.o" - - backend = XRTBackend(**backend_kwargs) - module_function = backend.compile(mlir_module) - - backend.unload() - - elif args.compile_mode == "compile-only": - ###### Compile only (without XRT dependencies) - # Map architecture to target device - target_device = "npu2" if args.arch == "aie2p" else "npu1" - - backend_kwargs = { - "verbose": args.verbose, - "target_device": target_device, # Explicit target based on arch (no xrt dependencies) - "output_format": "none", # Skip xclbin generation (no xrt dependencies) - "omit_while_true_loop": False, - "runtime_loop_tiling_sizes": [2, 2], - } - # Only use external kernel library if NOT in direct codegen mode - if not args.direct_codegen: - backend_kwargs["lower_linalg_to_func"] = "mm.o" - - backend = XRTBackend(**backend_kwargs) - module_function = backend.compile(mlir_module) - - backend.unload() - - print("Compilation completed successfully!") - sys.exit(0) + elif args.compile_mode in ("compile-and-xclbin", "compile-only"): + ###### Compile only (no execution) + if args.compile_mode == "compile-only": + # Skip xclbin generation (no xrt dependencies) + target_device = "npu2" if args.arch == "aie2p" else "npu1" + compile_kwargs["target_device"] = target_device + compile_kwargs["output_format"] = "none" + # Remap to compile-only so run_on_npu dispatches correctly + args.compile_mode = "compile-only" + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="matmul_bf16", + **compile_kwargs, + ) + ) diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py index 950f33a92..2ff7e1dfc 100644 --- a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py +++ b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py @@ -1,13 +1,11 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse - from air.ir import * from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -80,15 +78,8 @@ def segment_body(): @herd(name=format_name("xaddherd", h, w), sizes=[1, 1]) def herd_body(_tx, _ty, _sx, _sy): - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - tile_type = MemRefType.get( - shape=tile_size, - element_type=xrt_dtype, - memory_space=mem_space, - ) + tile_type = l1_memref_type(tile_size, xrt_dtype) # We must allocate a buffer of tile size for the input/output tile_in = AllocOp(tile_type, [], []) @@ -133,20 +124,7 @@ def herd_body(_tx, _ty, _sx, _sy): TILE_HEIGHT = 16 INOUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--image-height", type=int, @@ -162,14 +140,6 @@ def herd_body(_tx, _ty, _sx, _sy): parser.add_argument( "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -199,10 +169,11 @@ def herd_body(_tx, _ty, _sx, _sy): ) output_b[i, j] = input_a[i, j] + tile_num - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + run_on_npu( + args, + mlir_module, + inputs=[input_a], + expected_outputs=[output_b], instance_name="copy", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py index b33a3c7a5..1930fd7d9 100644 --- a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py +++ b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py @@ -1,14 +1,11 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse - from air.ir import * from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.dialects.affine import apply as affine_apply -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -43,62 +40,16 @@ def segment_body(arg2, arg3): operands=[arg2, arg3], ) def herd_body(tx, ty, _sx, _sy, a, b): - scaled_index_map_height = AffineMap.get( - 0, - 1, - [ - AffineExpr.get_mul( - AffineSymbolExpr.get(0), - AffineConstantExpr.get(tile_height), - ) - ], - ) - scaled_index_map_width = AffineMap.get( - 0, - 1, - [ - AffineExpr.get_mul( - AffineSymbolExpr.get(0), - AffineConstantExpr.get(tile_width), - ) - ], - ) - create_tile_index_height = AffineMap.get( - 0, - 1, - [ - AffineExpr.get_mul( - AffineSymbolExpr.get(0), - AffineConstantExpr.get(image_width // tile_width), - ) - ], - ) - create_tile_index = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineSymbolExpr.get(1), - ) - ], - ) - offset0 = affine_apply(scaled_index_map_height, [tx]) - offset1 = affine_apply(scaled_index_map_width, [ty]) - tile_index_height = affine_apply(create_tile_index_height, [tx]) - compute_tile_id = affine_apply( - create_tile_index, [tile_index_height, ty] + offset0 = tile_offset_1d(tx, 0, tile_height) + offset1 = tile_offset_1d(ty, 0, tile_width) + tile_index_height = arith.muli( + tx, + arith.ConstantOp.create_index(image_width // tile_width), ) - - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) + compute_tile_id = arith.addi(tile_index_height, ty) # This is the type definition of the tile - tile_type = MemRefType.get( - shape=tile_size, - element_type=T.i32(), - memory_space=mem_space, - ) + tile_type = l1_memref_type(tile_size, T.i32()) # We must allocate a buffer of tile size for the input/output tile_in = AllocOp(tile_type, [], []) @@ -151,20 +102,7 @@ def herd_body(tx, ty, _sx, _sy, a, b): TILE_HEIGHT = 16 INOUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--image-height", type=int, @@ -180,14 +118,6 @@ def herd_body(tx, ty, _sx, _sy, a, b): parser.add_argument( "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -217,10 +147,11 @@ def herd_body(tx, ty, _sx, _sy, a, b): ) output_b[i, j] = input_a[i, j] + tile_num - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + run_on_npu( + args, + mlir_module, + inputs=[input_a], + expected_outputs=[output_b], instance_name="copy", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py index c70a2048d..712796dea 100644 --- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py +++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py @@ -1,13 +1,11 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse - from air.ir import * from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -71,15 +69,8 @@ def segment_body(): @herd(name=format_name("xaddherd", h, w), sizes=[1, 1]) def herd_body(_tx, _ty, _sx, _sy): - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - tile_type = MemRefType.get( - shape=tile_size, - element_type=xrt_dtype, - memory_space=mem_space, - ) + tile_type = l1_memref_type(tile_size, xrt_dtype) # We must allocate a buffer of tile size for the input/output tile_in = AllocOp(tile_type, [], []) @@ -124,20 +115,7 @@ def herd_body(_tx, _ty, _sx, _sy): TILE_HEIGHT = 16 INOUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--image-height", type=int, @@ -153,14 +131,6 @@ def herd_body(_tx, _ty, _sx, _sy): parser.add_argument( "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -190,10 +160,11 @@ def herd_body(_tx, _ty, _sx, _sy): ) output_b[i, j] = input_a[i, j] + tile_num - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + run_on_npu( + args, + mlir_module, + inputs=[input_a], + expected_outputs=[output_b], instance_name="copy", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py index 6c4c1c553..daad63ab8 100644 --- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py +++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py @@ -1,13 +1,11 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse - from air.ir import * from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -71,15 +69,8 @@ def segment_body(): @herd(name="xaddherd", sizes=[1, 1]) def herd_body(_tx, _ty, _sx, _sy): - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - tile_type = MemRefType.get( - shape=tile_size, - element_type=xrt_dtype, - memory_space=mem_space, - ) + tile_type = l1_memref_type(tile_size, xrt_dtype) # Loop over columns and rows of tiles for tile_num in range_( @@ -127,20 +118,7 @@ def herd_body(_tx, _ty, _sx, _sy): TILE_HEIGHT = 16 INOUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--image-height", type=int, @@ -156,14 +134,6 @@ def herd_body(_tx, _ty, _sx, _sy): parser.add_argument( "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -193,10 +163,11 @@ def herd_body(_tx, _ty, _sx, _sy): ) output_b[i, j] = input_a[i, j] + tile_num - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + run_on_npu( + args, + mlir_module, + inputs=[input_a], + expected_outputs=[output_b], instance_name="copy", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py index 7c72ba371..ca29409d4 100644 --- a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py +++ b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py @@ -1,13 +1,11 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse - from air.ir import * from air.dialects.air import * from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -38,15 +36,8 @@ def segment_body(arg2, arg3): # We just need one compute core, so we ask for a 1x1 herd @herd(name="xaddherd", sizes=[1, 1], operands=[arg2, arg3]) def herd_body(_tx, _ty, _sx, _sy, a, b): - # We want to store our data in L1 memory - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - tile_type = MemRefType.get( - shape=tile_size, - element_type=T.i32(), - memory_space=mem_space, - ) + tile_type = l1_memref_type(tile_size, T.i32()) # Loop over columns and rows of tiles for tile_index0 in range_(image_height // tile_height): @@ -121,20 +112,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): TILE_HEIGHT = 16 INOUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--image-height", type=int, @@ -150,14 +128,6 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): parser.add_argument( "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -187,10 +157,11 @@ def herd_body(_tx, _ty, _sx, _sy, a, b): ) output_b[i, j] = input_a[i, j] + tile_num - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + run_on_npu( + args, + mlir_module, + inputs=[input_a], + expected_outputs=[output_b], instance_name="copy", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/matrix_vector_multiplication/bf16/matvec.py b/programming_examples/matrix_vector_multiplication/bf16/matvec.py index 30bfd232a..e4c6f05e1 100644 --- a/programming_examples/matrix_vector_multiplication/bf16/matvec.py +++ b/programming_examples/matrix_vector_multiplication/bf16/matvec.py @@ -25,8 +25,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -376,28 +375,28 @@ def herd_body(_tx, _ty, _sx, _sy, _l1_c, _l2_c): input_a.astype(np.float32), input_b.astype(np.float32) ).astype(OUTPUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - output_format=args.output_format, - instance_name="matvec_bf16", - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="matvec_bf16", expected_outputs=[output_c], rtol=0.04, atol=1e-3, + runtime_loop_tiling_sizes=[4, 4], ) ) elif args.compile_mode == "compile-and-xclbin": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], + # Remap to compile-only so run_on_npu dispatches correctly + args.compile_mode = "compile-only" + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="matvec_bf16", + runtime_loop_tiling_sizes=[4, 4], + ) ) - module_function = backend.compile(mlir_module) - backend.unload() diff --git a/programming_examples/mnist_fc/argmax/run.py b/programming_examples/mnist_fc/argmax/run.py index 4b9b396ad..7d33ce2dd 100644 --- a/programming_examples/mnist_fc/argmax/run.py +++ b/programming_examples/mnist_fc/argmax/run.py @@ -25,8 +25,7 @@ from air.dialects.memref import AllocOp, DeallocOp, subview, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, run_on_npu from air.extras import types as extrasT np.random.seed(42) @@ -197,6 +196,13 @@ def herd_body(tx, ty, _sx, _sy, _loff_row, _l3_a, _l3_out): dest="compile_mode", default="compile-and-run", ) + parser.add_argument( + "--output-format", + type=str, + choices=["xclbin", "elf"], + default="xclbin", + dest="output_format", + ) args = parser.parse_args() @@ -258,29 +264,26 @@ def herd_body(tx, ty, _sx, _sy, _loff_row, _l3_a, _l3_out): "values": sampled_values, } - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="xclbin", - instance_name="argmax", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a], + instance_name="argmax", stochastic_expected_outputs=[sampled_data], rtol=0, atol=0, + runtime_loop_tiling_sizes=[4, 4], ) ) elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="xclbin", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="argmax", + runtime_loop_tiling_sizes=[4, 4], + ) ) - module_function = backend.compile(mlir_module) - backend.unload() diff --git a/programming_examples/mnist_fc/broadcast_bias_add/run.py b/programming_examples/mnist_fc/broadcast_bias_add/run.py index dfdba1b2c..56d59c149 100644 --- a/programming_examples/mnist_fc/broadcast_bias_add/run.py +++ b/programming_examples/mnist_fc/broadcast_bias_add/run.py @@ -28,8 +28,7 @@ from air.dialects.vector import transfer_read, transfer_write, BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, run_on_npu from air.extras import types as extrasT np.random.seed(42) @@ -311,6 +310,9 @@ def herd_body( input_bias = np.zeros(N_padded, dtype=np.float32) input_bias[:N_actual] = (np.random.randn(N_actual) * 2).astype(np.float32) + # Set output_format based on padding requirements + args.output_format = "elf" if needs_padding else "xclbin" + if args.compile_mode == "compile-and-run": # Golden: C[row,col] = A[row,col] + bias[col] num_samples = 100 @@ -355,28 +357,25 @@ def herd_body( "values": sampled_values, } - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="elf" if needs_padding else "xclbin", - instance_name="broadcast_bias_add", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_bias], + instance_name="broadcast_bias_add", stochastic_expected_outputs=[sampled_data], rtol=1e-6, + runtime_loop_tiling_sizes=[4, 4], ) ) elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="elf" if needs_padding else "xclbin", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="broadcast_bias_add", + runtime_loop_tiling_sizes=[4, 4], + ) ) - module_function = backend.compile(mlir_module) - backend.unload() diff --git a/programming_examples/mnist_fc/integration/run.py b/programming_examples/mnist_fc/integration/run.py index b5673d8aa..92acc2421 100644 --- a/programming_examples/mnist_fc/integration/run.py +++ b/programming_examples/mnist_fc/integration/run.py @@ -35,8 +35,7 @@ from air.dialects.vector import transfer_read, transfer_write, BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, run_on_npu from air.compiler.util import run_transform from air.extras import types as extrasT @@ -696,6 +695,13 @@ def _extend_with_elementwise( dest="compile_mode", default="compile-and-run", ) + parser.add_argument( + "--output-format", + type=str, + choices=["xclbin", "elf"], + default="elf", + dest="output_format", + ) args = parser.parse_args() # aie2p mmul dimensions @@ -811,15 +817,9 @@ def _extend_with_elementwise( "values": golden_argmax_pad[sampled_cols], } - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="elf", - instance_name="mnist_fc", - runtime_loop_tiling_sizes=[1, 1], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[ W1, @@ -831,18 +831,21 @@ def _extend_with_elementwise( bias2, bias2_out, ], + instance_name="mnist_fc", stochastic_expected_outputs=[sampled_data], rtol=0, atol=0, + runtime_loop_tiling_sizes=[1, 1], ) ) elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="elf", - runtime_loop_tiling_sizes=[1, 1], + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="mnist_fc", + runtime_loop_tiling_sizes=[1, 1], + ) ) - module_function = backend.compile(mlir_module) - backend.unload() diff --git a/programming_examples/mnist_fc/relu/run.py b/programming_examples/mnist_fc/relu/run.py index 08ea4d01d..2cdcf2eb7 100644 --- a/programming_examples/mnist_fc/relu/run.py +++ b/programming_examples/mnist_fc/relu/run.py @@ -24,8 +24,7 @@ from air.dialects.vector import transfer_read, transfer_write, BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, run_on_npu from air.extras import types as extrasT np.random.seed(42) @@ -335,6 +334,9 @@ def herd_body(tx, ty, _sx, _sy, _loff_m, _loff_n, _l3_a, _l3_out): np.float32 ) + # Set output_format based on padding requirements + args.output_format = "elf" if needs_padding else "xclbin" + if args.compile_mode == "compile-and-run": # Golden reference: max(x, 0) num_samples = 100 @@ -381,29 +383,26 @@ def herd_body(tx, ty, _sx, _sy, _loff_m, _loff_n, _l3_a, _l3_out): "values": sampled_values, } - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="elf" if needs_padding else "xclbin", - instance_name="relu", - runtime_loop_tiling_sizes=[4, 4], - ) # bf16 truncation introduces rounding; use bf16-appropriate tolerance exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a], + instance_name="relu", stochastic_expected_outputs=[sampled_data], rtol=1e-2, + runtime_loop_tiling_sizes=[4, 4], ) ) elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="elf" if needs_padding else "xclbin", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[], + instance_name="relu", + runtime_loop_tiling_sizes=[4, 4], + ) ) - module_function = backend.compile(mlir_module) - backend.unload() diff --git a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py index af2eea2fe..8fae089ae 100644 --- a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py +++ b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -21,15 +20,8 @@ def build_module(): xrt_dtype = type_mapper(INOUT_DATATYPE) memrefTyInOut = T.memref(VECTOR_LEN, xrt_dtype) - # We want to store our data in L1 memory - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - image_type_l1 = MemRefType.get( - shape=[VECTOR_LEN], - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + image_type_l1 = l1_memref_type([VECTOR_LEN], xrt_dtype) Channel("ChanInA") Channel("ChanInB") @@ -94,27 +86,8 @@ def herd_body(tx, ty, sx, sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the multi segment channel example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", + parser = make_air_parser( + "Builds, runs, and tests the multi segment channel example" ) args = parser.parse_args() @@ -129,16 +102,12 @@ def herd_body(tx, ty, sx, sy): output_c = np.full(VECTOR_LEN, 5, dtype=INOUT_DATATYPE) output_d = np.full(VECTOR_LEN, 13, dtype=INOUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="copy", expected_outputs=[output_c, output_d], ) ) diff --git a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py index 058a77576..f8f33cbe3 100644 --- a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py +++ b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -21,15 +20,8 @@ def build_module(): xrt_dtype = type_mapper(INOUT_DATATYPE) memrefTyInOut = T.memref(VECTOR_LEN, xrt_dtype) - # We want to store our data in L1 memory - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - image_type_l1 = MemRefType.get( - shape=[VECTOR_LEN], - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + image_type_l1 = l1_memref_type([VECTOR_LEN], xrt_dtype) # We will send an image worth of data in and out @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut, memrefTyInOut, memrefTyInOut) @@ -86,28 +78,7 @@ def herd_body(tx, ty, sx, sy, b, d): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the multi segment dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) + parser = make_air_parser("Builds, runs, and tests the multi segment dma example") args = parser.parse_args() @@ -121,16 +92,12 @@ def herd_body(tx, ty, sx, sy, b, d): output_c = np.full(VECTOR_LEN, 5, dtype=INOUT_DATATYPE) output_d = np.full(VECTOR_LEN, 13, dtype=INOUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="copy", expected_outputs=[output_c, output_d], ) ) diff --git a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py index b4c4c52c1..7c43b645c 100644 --- a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py +++ b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -29,12 +28,7 @@ def build_module(vector_size, num_subvectors): lineWidthInBytes = vector_size // num_subvectors # Memref type definition used by the compute core and external function - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - tensor_type = MemRefType.get( - shape=[lineWidthInBytes], - element_type=xrt_dtype, - memory_space=mem_space, - ) + tensor_type = l1_memref_type([lineWidthInBytes], xrt_dtype) @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut) def copy(arg0, arg1): @@ -75,10 +69,7 @@ def herd_body(_tx, _ty, _sx, _sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "-s", "--vector_size", @@ -92,24 +83,6 @@ def herd_body(_tx, _ty, _sx, _sy): default=4, help="The number of sub-vectors to break the vector into", ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -124,10 +97,12 @@ def herd_body(_tx, _ty, _sx, _sy): input_a[i] = i % 0xFF output_b[i] = i % 0xFF - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="copy", + expected_outputs=[output_b], + ) ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py index 12baf5dad..58ac631a6 100644 --- a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py +++ b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from ml_dtypes import bfloat16 @@ -9,7 +8,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -36,12 +35,7 @@ def build_module(vector_size, num_subvectors, np_dtype): lineWidthInBytes = vector_size // num_subvectors # Memref type definition used by the compute core and external function - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - tensor_type = MemRefType.get( - shape=[lineWidthInBytes], - element_type=xrt_dtype, - memory_space=mem_space, - ) + tensor_type = l1_memref_type([lineWidthInBytes], xrt_dtype) @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut) def copy(arg0, arg1): @@ -95,10 +89,7 @@ def herd_body(_tx, _ty, _sx, _sy, c, d): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "-s", "--vector_size", @@ -112,24 +103,6 @@ def herd_body(_tx, _ty, _sx, _sy, c, d): default=4, help="The number of sub-vectors to break the vector into", ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) parser.add_argument( "-t", "--dtype", @@ -148,10 +121,12 @@ def herd_body(_tx, _ty, _sx, _sy, c, d): input_a = np.arange(args.vector_size, dtype=np_dtype) output_b = np.arange(args.vector_size, dtype=np_dtype) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="copy", + expected_outputs=[output_b], + ) ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py index 7d4917b23..ad58ec774 100644 --- a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py +++ b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -29,12 +28,7 @@ def build_module(vector_size, num_subvectors): lineWidthInBytes = vector_size // num_subvectors # Memref type definition used by the compute core and external function - mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - tensor_type = MemRefType.get( - shape=[lineWidthInBytes], - element_type=xrt_dtype, - memory_space=mem_space, - ) + tensor_type = l1_memref_type([lineWidthInBytes], xrt_dtype) # Function definition of the external function we will call passThroughLine = external_func( @@ -77,10 +71,7 @@ def herd_body(_tx, _ty, _sx, _sy): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "-s", "--vector_size", @@ -94,24 +85,6 @@ def herd_body(_tx, _ty, _sx, _sy): default=4, help="The number of sub-vectors to break the vector into", ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -123,10 +96,12 @@ def herd_body(_tx, _ty, _sx, _sy): input_a = np.arange(args.vector_size, dtype=INOUT_DATATYPE) output_b = np.arange(args.vector_size, dtype=INOUT_DATATYPE) - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="copy", + expected_outputs=[output_b], + ) ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py b/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py index 8198645bd..455fd9da0 100644 --- a/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py +++ b/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py @@ -1,18 +1,15 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.math import rsqrt from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -29,11 +26,7 @@ def build_module(n, tile_n, np_dtype_in): l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def scalar_invsqrt(arg0, arg1): @@ -55,21 +48,7 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, @@ -117,19 +96,8 @@ def herd_body( TILE_N = 1024 INPUT_DATATYPE = np.float32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the scalar inverse square root (1/sqrt(x)) example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the scalar inverse square root (1/sqrt(x)) example" ) parser.add_argument( "--n", @@ -138,14 +106,6 @@ def herd_body( help="Total number of elements", ) parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) args = parser.parse_args() mlir_module = build_module( @@ -162,51 +122,34 @@ def herd_body( np.random.seed(37) input_a = np.random.uniform(0.1, 10.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices: 1.0 / sqrt(x) - sampled_values = np.array( - [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], - dtype=INPUT_DATATYPE, - ) + # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), # i indices + ] + ) - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) - ) + # Compute reference results for sampled indices: 1.0 / sqrt(x) + sampled_values = np.array( + [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], + dtype=INPUT_DATATYPE, + ) - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], + # Store as a dictionary + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="scalar_invsqrt", + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py b/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py index ed936d96e..72ee100dc 100644 --- a/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py +++ b/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py @@ -1,17 +1,14 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -28,11 +25,7 @@ def build_module(n, tile_n, np_dtype_in): l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def scalar_reciprocal(arg0, arg1): @@ -54,21 +47,7 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, @@ -119,19 +98,8 @@ def herd_body( TILE_N = 1024 INPUT_DATATYPE = np.float32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the scalar reciprocal (1/x) example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the scalar reciprocal (1/x) example" ) parser.add_argument( "--n", @@ -140,14 +108,6 @@ def herd_body( help="Total number of elements", ) parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) args = parser.parse_args() mlir_module = build_module( @@ -164,51 +124,34 @@ def herd_body( # Use a safe range [1, 10] to avoid division by zero or very small numbers input_a = np.random.uniform(1.0, 10.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices: 1.0 / x - sampled_values = np.array( - [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]], - dtype=INPUT_DATATYPE, - ) + # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), # i indices + ] + ) - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-5, - ) - ) + # Compute reference results for sampled indices: 1.0 / x + sampled_values = np.array( + [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]], + dtype=INPUT_DATATYPE, + ) - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], + # Store as a dictionary + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="scalar_reciprocal", + stochastic_expected_outputs=[sampled_data], + rtol=1e-5, ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py b/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py index 5cff5dce4..a0b700409 100644 --- a/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py +++ b/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py @@ -17,19 +17,16 @@ Uses a 1x2 AIE herd with DMA transfers between L3 and L1 memory. """ -import argparse import numpy as np from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -45,11 +42,7 @@ def build_module(n, tile_n, np_dtype, shift_amount=4): l3memrefTy = MemRefType.get([n], xrt_dtype) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype) @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def scalar_shift_saturate(arg0, arg1): @@ -64,21 +57,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): l1_out = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_in, @@ -138,12 +117,9 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): SHIFT_AMOUNT = 4 INPUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the scalar shift+saturate example", + parser = make_air_parser( + "Builds, runs, and tests the scalar shift+saturate example" ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( @@ -152,20 +128,6 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): default=SHIFT_AMOUNT, help="Right shift amount (quantization scale factor)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() @@ -181,53 +143,37 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): max_val = (127 << args.shift_amount) + (1 << args.shift_amount) input_a = np.random.randint(-max_val, max_val, args.n, dtype=INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - - # Reference: SRS (Shift-Round-Saturate) with positive_inf rounding. - # AIECoreToStandard sets rounding mode 9 (positive_inf) for integer SRS, - # which rounds toward positive infinity at the midpoint. - def ref_shift_saturate(x, shift): - shifted = (x + (1 << (shift - 1))) >> shift - return np.clip(shifted, -128, 127).astype(np.int8).astype(np.int32) - - sampled_values = np.array( - [ - ref_shift_saturate(input_a[i], args.shift_amount) - for i in zip(*sampled_indices) - ], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) + + # Reference: SRS (Shift-Round-Saturate) with positive_inf rounding. + # AIECoreToStandard sets rounding mode 9 (positive_inf) for integer SRS, + # which rounds toward positive infinity at the midpoint. + def ref_shift_saturate(x, shift): + shifted = (x + (1 << (shift - 1))) >> shift + return np.clip(shifted, -128, 127).astype(np.int8).astype(np.int32) + + sampled_values = np.array( + [ + ref_shift_saturate(input_a[i], args.shift_amount) + for i in zip(*sampled_indices) + ], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="scalar_shift_saturate", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=0, - atol=0, - ) + stochastic_expected_outputs=[sampled_data], + rtol=0, + atol=0, ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_add/vector_add.py b/programming_examples/primitives/vector_examples/vector_add/vector_add.py index 0008f6aaf..f587978c8 100644 --- a/programming_examples/primitives/vector_examples/vector_add/vector_add.py +++ b/programming_examples/primitives/vector_examples/vector_add/vector_add.py @@ -1,18 +1,36 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -24,23 +42,16 @@ @module_builder def build_module(n, tile_n, np_dtype_in, vector_size=16): a_size = [n] - b_size = a_size - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_add(arg0, arg1, arg2): @@ -63,97 +74,37 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) dma_memcpy_nd( l1_b_data, _l3_b, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_b_vec = subview( - l1_b_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) - v_b = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_b_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_c = arith.AddFOp(v_a, v_b) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -165,26 +116,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, @@ -198,22 +135,6 @@ def herd_body( default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) parser.add_argument( "--bf16-emulation", dest="bf16_emulation", @@ -234,65 +155,24 @@ def herd_body( INPUT_DATATYPE, args.vector_size, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.arange(0, args.n, dtype=np.int64) input_a = input_a.astype(INPUT_DATATYPE) input_b = np.arange(0, args.n, dtype=np.int64) input_b = input_b.astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] + input_b[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_a, input_b], args.n, lambda a, b: a + b, INPUT_DATATYPE + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], instance_name="vector_add", + stochastic_expected_outputs=[sampled_data], + rtol=5e-2 if bf16_emulation else 1e-3, bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=5e-2 if bf16_emulation else 1e-3, - ) ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py b/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py index 715225240..447494a26 100644 --- a/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py +++ b/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py @@ -1,10 +1,18 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store, subview, collapse_shape @@ -19,8 +27,13 @@ from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ from air.dialects.math import exp -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + tiled_1d_offset, + make_air_parser, + run_on_npu, + check_print_module, +) import numpy as np @@ -44,16 +57,8 @@ def build_module(m, n, tile_m, np_dtype_in): l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_m, 1], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - l1outputMemrefTy = MemRefType.get( - shape=[tile_m, n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_m, 1], xrt_dtype_in) + l1outputMemrefTy = l1_memref_type([tile_m, n], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy) def vector_broadcast_scalar(arg0, arg2): @@ -74,28 +79,12 @@ def herd_body( l1_out_data = AllocOp(l1outputMemrefTy, [], []) for _l_ivx in range_(0, m, tile_m * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_m), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_m) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_m], src_strides=[1], ) @@ -143,24 +132,15 @@ def herd_body( ) cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) scalar = load(collapse_a, [c0]) - # v_a = transfer_read( - # VectorType.get([n], xrt_dtype_in), - # collapse_a, - # [c0], - # AffineMapAttr.get(AffineMap.get_identity(1)), - # cst0, - # [True], - # ) v_c_broadcast = broadcast(VectorType.get([n], xrt_dtype_in), scalar) - # store(v_c_broadcast, collapse_c, [c0]) transfer_write( None, v_c_broadcast, collapse_c, [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), + identity_map_attr(), [True], ) yield_([]) @@ -179,26 +159,12 @@ def herd_body( if __name__ == "__main__": - # Default values. M = 65536 N = 16 TILE_M = 256 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--m", type=int, @@ -212,22 +178,6 @@ def herd_body( help="Output size (dimension N, the broadcasted dimension)", ) parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -237,55 +187,33 @@ def herd_body( args.tile_m, INPUT_DATATYPE, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.arange(0, (args.m), dtype=INPUT_DATATYPE).reshape(args.m, 1) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - - # Compute reference results for sampled indices - sampled_values = np.array( - [np.broadcast_to(input_a[i], (args.n,)) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) + # Stochastically sample num_sample results for verification. + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - # Store as a dictionary - sampled_data = { - "shape": (args.m, args.n), - "indices": sampled_indices, - "values": sampled_values, - } + # Compute reference results for sampled indices + sampled_values = np.array( + [np.broadcast_to(input_a[i], (args.n,)) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + # Store as a dictionary + sampled_data = { + "shape": (args.m, args.n), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_broadcast_scalar", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_div/vector_div.py b/programming_examples/primitives/vector_examples/vector_div/vector_div.py index cec991145..129bf035c 100644 --- a/programming_examples/primitives/vector_examples/vector_div/vector_div.py +++ b/programming_examples/primitives/vector_examples/vector_div/vector_div.py @@ -1,18 +1,36 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse -from ml_dtypes import bfloat16 +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) + +import numpy as np range_ = for_ @@ -20,8 +38,6 @@ @module_builder def build_module(n, tile_n, np_dtype_in, arch="aie2"): a_size = [n] - b_size = a_size - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 @@ -33,15 +49,10 @@ def build_module(n, tile_n, np_dtype_in, arch="aie2"): VECTOR_SIZE = arch_vector_sizes.get(arch, 16) # default to 16 if unknown index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_div(arg0, arg1, arg2): @@ -65,97 +76,37 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) dma_memcpy_nd( l1_b_data, _l3_b, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_b_vec = subview( - l1_b_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) - v_b = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_b_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_c = arith.DivFOp(v_a, v_b) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -167,25 +118,11 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 INPUT_DATATYPE = np.float32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector division example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the vector division example") parser.add_argument( "--n", type=int, @@ -200,22 +137,6 @@ def herd_body( default="aie2", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -225,9 +146,7 @@ def herd_body( INPUT_DATATYPE, args.arch, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) # Generate random input vectors with fixed seed for reproducibility np.random.seed(37) @@ -235,54 +154,16 @@ def herd_body( input_a = np.random.uniform(0.1, 10.0, args.n).astype(INPUT_DATATYPE) input_b = np.random.uniform(1.0, 10.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] / input_b[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_a, input_b], args.n, lambda a, b: a / b, INPUT_DATATYPE + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], instance_name="vector_div", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=1e-2, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py b/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py index 51463700f..b24a54793 100644 --- a/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py +++ b/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py @@ -1,19 +1,37 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ from air.dialects.math import exp -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -25,22 +43,16 @@ @module_builder def build_module(n, tile_n, np_dtype_in, arch="aie2", vector_size=16): a_size = [n] - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def vector_exp(arg0, arg2): @@ -66,74 +78,29 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_c = exp(v_a) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -144,26 +111,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, @@ -184,22 +137,6 @@ def herd_body( default="aie2", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -210,62 +147,21 @@ def herd_body( args.arch, args.vector_size, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) # Generate input values in a safe range for exp operation to avoid overflow - # Using values between -5 and 5 to ensure exp(x) stays within bfloat16 range input_a = np.random.uniform(-5, 5, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [np.exp(input_a[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_a], args.n, lambda x: np.exp(x), INPUT_DATATYPE + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_exp", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py b/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py index 660294b39..59f2bb133 100644 --- a/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py +++ b/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py @@ -1,19 +1,38 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp, fma +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp, fma from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -30,18 +49,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16): VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get([n], xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_fma(arg0, arg1, arg2): @@ -67,21 +78,7 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_b_data, @@ -108,16 +105,12 @@ def herd_body( v_a = BroadcastOp(vecTy, a_const) for j in range_(c0, cTileN, cVecSize): - sub_b = subview(l1_b_data.result, [j], [VECTOR_SIZE], [1]) - sub_c = subview(l1_c_data.result, [j], [VECTOR_SIZE], [1]) - sub_out = subview(l1_out_data.result, [j], [VECTOR_SIZE], [1]) - - v_b = transfer_read(vecTy, sub_b, [c0], identity_map, cst0, [True]) - v_c = transfer_read(vecTy, sub_c, [c0], identity_map, cst0, [True]) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_c = vec_read(l1_c_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # alpha * b + c via vector.fma v_result = fma(v_a, v_b, v_c) - transfer_write(None, v_result, sub_out, [c0], identity_map, [True]) + vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -141,12 +134,7 @@ def herd_body( INPUT_DATATYPE = bfloat16 ALPHA = 2.0 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector_fma example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the vector_fma example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( @@ -158,20 +146,6 @@ def herd_body( default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) parser.add_argument( "--bf16-emulation", dest="bf16_emulation", @@ -189,51 +163,26 @@ def herd_body( mlir_module = build_module( args.n, args.tile_n, INPUT_DATATYPE, args.alpha, args.vector_size ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_b = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE) input_c = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - sampled_values = np.array( - [args.alpha * input_b[i] + input_c[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_b, input_c], + args.n, + lambda b, c: args.alpha * b + c, + INPUT_DATATYPE, + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_b, input_c], instance_name="vector_fma", + stochastic_expected_outputs=[sampled_data], + rtol=2e-1 if bf16_emulation else 1e-2, + atol=5e-2 if bf16_emulation else 1e-8, bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], ) - exit( - runner.run_test( - mlir_module, - inputs=[input_b, input_c], - stochastic_expected_outputs=[sampled_data], - rtol=2e-1 if bf16_emulation else 1e-2, - atol=5e-2 if bf16_emulation else 1e-8, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_max/vector_max.py b/programming_examples/primitives/vector_examples/vector_max/vector_max.py index d4a622cd2..067253e65 100644 --- a/programming_examples/primitives/vector_examples/vector_max/vector_max.py +++ b/programming_examples/primitives/vector_examples/vector_max/vector_max.py @@ -1,18 +1,36 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp, MaximumFOp -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -24,23 +42,16 @@ @module_builder def build_module(n, tile_n, np_dtype_in, vector_size=16): a_size = [n] - b_size = a_size - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_max(arg0, arg1, arg2): @@ -63,97 +74,37 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) dma_memcpy_nd( l1_b_data, _l3_b, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_b_vec = subview( - l1_b_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) - v_b = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_b_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_c = MaximumFOp(v_a, v_b) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -165,26 +116,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector_max example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the vector_max example") parser.add_argument( "--n", type=int, @@ -198,22 +135,6 @@ def herd_body( default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) parser.add_argument( "--bf16-emulation", dest="bf16_emulation", @@ -234,63 +155,25 @@ def herd_body( INPUT_DATATYPE, args.vector_size, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE) input_b = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [max(input_a[i], input_b[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_a, input_b], + args.n, + lambda a, b: max(a, b), + INPUT_DATATYPE, + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], instance_name="vector_max", + stochastic_expected_outputs=[sampled_data], + rtol=5e-2 if bf16_emulation else 1e-3, bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=5e-2 if bf16_emulation else 1e-3, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py b/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py index 67bfa2248..0b8e44851 100644 --- a/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py +++ b/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py @@ -1,18 +1,36 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -24,8 +42,6 @@ @module_builder def build_module(n, tile_n, np_dtype_in, arch="aie2"): a_size = [n] - b_size = a_size - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 @@ -37,15 +53,10 @@ def build_module(n, tile_n, np_dtype_in, arch="aie2"): VECTOR_SIZE = arch_vector_sizes.get(arch, 16) # default to 16 if unknown index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_mul(arg0, arg1, arg2): @@ -68,97 +79,37 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) dma_memcpy_nd( l1_b_data, _l3_b, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_b_vec = subview( - l1_b_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) - v_b = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_b_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_c = arith.MulFOp(v_a, v_b) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -170,25 +121,11 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, @@ -203,22 +140,6 @@ def herd_body( default="aie2", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) parser.add_argument( "--bf16-emulation", dest="bf16_emulation", @@ -239,65 +160,24 @@ def herd_body( INPUT_DATATYPE, args.arch, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.arange(0, args.n, dtype=np.int64).reshape(args.n) input_a = input_a.astype(INPUT_DATATYPE) input_b = np.arange(0, args.n, dtype=np.int64).reshape(args.n) input_b = input_b.astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] * input_b[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_a, input_b], args.n, lambda a, b: a * b, INPUT_DATATYPE + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], instance_name="vector_mul", + stochastic_expected_outputs=[sampled_data], + rtol=5e-2 if bf16_emulation else 1e-2, bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=5e-2 if bf16_emulation else 1e-2, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py b/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py index 73dd547ad..27f40e815 100644 --- a/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py +++ b/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py @@ -1,19 +1,38 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith from air.dialects.arith import ConstantOp, mulf, addf -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -30,18 +49,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16): VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get([n], xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_muladd(arg0, arg1, arg2): @@ -67,21 +78,7 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_b_data, @@ -108,18 +105,14 @@ def herd_body( v_a = BroadcastOp(vecTy, a_const).result for j in range_(c0, cTileN, cVecSize): - sub_b = subview(l1_b_data.result, [j], [VECTOR_SIZE], [1]) - sub_c = subview(l1_c_data.result, [j], [VECTOR_SIZE], [1]) - sub_out = subview(l1_out_data.result, [j], [VECTOR_SIZE], [1]) - - v_b = transfer_read(vecTy, sub_b, [c0], identity_map, cst0, [True]) - v_c = transfer_read(vecTy, sub_c, [c0], identity_map, cst0, [True]) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_c = vec_read(l1_c_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # alpha * b + c via separate arith.mulf + arith.addf # The aievec pass fuses this into aievec.mac_elem (PR #2896) v_ab = mulf(v_a, v_b) v_result = addf(v_ab, v_c) - transfer_write(None, v_result, sub_out, [c0], identity_map, [True]) + vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -143,12 +136,7 @@ def herd_body( INPUT_DATATYPE = bfloat16 ALPHA = 2.0 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector_muladd example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the vector_muladd example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( @@ -160,68 +148,30 @@ def herd_body( default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module( args.n, args.tile_n, INPUT_DATATYPE, args.alpha, args.vector_size ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_b = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE) input_c = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - sampled_values = np.array( - [args.alpha * input_b[i] + input_c[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_b, input_c], + args.n, + lambda b, c: args.alpha * b + c, + INPUT_DATATYPE, + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_b, input_c], instance_name="vector_muladd", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_b, input_c], - stochastic_expected_outputs=[sampled_data], - rtol=1e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py b/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py index 000bcbd6d..0edc7de27 100644 --- a/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py +++ b/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py @@ -1,18 +1,36 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + import numpy as np from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, broadcast +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import broadcast from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + check_print_module, +) range_ = for_ @@ -20,7 +38,6 @@ @module_builder def build_module(n, tile_n, np_dtype_in, arch="aie2"): a_size = [n] - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 @@ -32,15 +49,10 @@ def build_module(n, tile_n, np_dtype_in, arch="aie2"): VECTOR_SIZE = arch_vector_sizes.get(arch, 16) # default to 16 if unknown index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def vector_reciprocal(arg0, arg1): @@ -62,28 +74,12 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) @@ -94,51 +90,20 @@ def herd_body( # Create constant 1.0 scalar and broadcast to vector one_scalar = arith.ConstantOp(xrt_dtype_in, 1.0) - one_vector = broadcast( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - one_scalar, - ) + one_vector = broadcast(vecTy, one_scalar) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # Compute reciprocal: 1.0 / a v_c = arith.DivFOp(one_vector, v_a) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -149,24 +114,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 INPUT_DATATYPE = np.float32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector reciprocal (1/x) example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the vector reciprocal (1/x) example" ) parser.add_argument( "--n", @@ -182,14 +135,6 @@ def herd_body( default="aie2", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) args = parser.parse_args() mlir_module = build_module( @@ -198,60 +143,36 @@ def herd_body( INPUT_DATATYPE, args.arch, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) # Generate random input vector with fixed seed for reproducibility np.random.seed(37) # Use a safe range [1, 10] to avoid division by zero input_a = np.random.uniform(1.0, 10.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices: 1.0 / x - sampled_values = np.array( - [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-5, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), + ] + ) + # Compute reference results for sampled indices: 1.0 / x + sampled_values = np.array( + [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="vector_reciprocal", + stochastic_expected_outputs=[sampled_data], + rtol=1e-5, ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py b/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py index 1f6d2862f..03bfd430a 100644 --- a/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py +++ b/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py @@ -1,10 +1,18 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store, subview, collapse_shape @@ -19,8 +27,17 @@ from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ from air.dialects.math import exp -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from air.dialects import arith +from utils import ( + make_l1_memref, + identity_map_1d, + tiled_1d_offset, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -43,16 +60,8 @@ def build_module(m, n, tile_m, np_dtype_in): l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_m, n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - l1outputMemrefTy = MemRefType.get( - shape=[tile_m, 1], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_m, n], xrt_dtype_in) + l1outputMemrefTy = make_l1_memref([tile_m, 1], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy) def vector_reduce_add(arg0, arg2): @@ -73,21 +82,7 @@ def herd_body( l1_out_data = AllocOp(l1outputMemrefTy, [], []) for _l_ivx in range_(0, m, tile_m * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_m), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_m) dma_memcpy_nd( l1_a_data, @@ -142,7 +137,7 @@ def herd_body( VectorType.get([n], xrt_dtype_in), collapse_a, [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), + identity_map_1d(), cst0, [True], ) @@ -166,26 +161,12 @@ def herd_body( if __name__ == "__main__": - # Default values. M = 65536 N = 16 TILE_M = 256 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--m", type=int, @@ -199,22 +180,6 @@ def herd_body( help="Input size (dimension N)", ) parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -224,58 +189,36 @@ def herd_body( args.tile_m, INPUT_DATATYPE, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.arange(0, (args.m * args.n), dtype=INPUT_DATATYPE).reshape( args.m, args.n ) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - - # Compute reference results for sampled indices - sampled_values = np.array( - [np.sum(input_a[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) + # Stochastically sample num_sample results for verification. + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - # Store as a dictionary - sampled_data = { - "shape": (args.m,), - "indices": sampled_indices, - "values": sampled_values, - } + # Compute reference results for sampled indices + sampled_values = np.array( + [np.sum(input_a[i]) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + # Store as a dictionary + sampled_data = { + "shape": (args.m,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_reduce_add", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py b/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py index 3a1589443..c02bd64dd 100644 --- a/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py +++ b/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py @@ -1,10 +1,18 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store, subview, collapse_shape @@ -19,8 +27,14 @@ from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ from air.dialects.math import exp -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + tiled_1d_offset, + make_air_parser, + run_on_npu, + check_print_module, +) import numpy as np @@ -44,16 +58,8 @@ def build_module(m, n, tile_m, np_dtype_in): l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_m, n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - l1outputMemrefTy = MemRefType.get( - shape=[tile_m, 1], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_m, n], xrt_dtype_in) + l1outputMemrefTy = make_l1_memref([tile_m, 1], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy) def vector_reduce_max(arg0, arg2): @@ -74,21 +80,7 @@ def herd_body( l1_out_data = AllocOp(l1outputMemrefTy, [], []) for _l_ivx in range_(0, m, tile_m * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_m), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_m) dma_memcpy_nd( l1_a_data, @@ -144,7 +136,7 @@ def herd_body( VectorType.get([n], xrt_dtype_in), collapse_a, [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), + identity_map_attr(), cst0, [True], ) @@ -168,26 +160,12 @@ def herd_body( if __name__ == "__main__": - # Default values. M = 65536 N = 32 TILE_M = 256 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--m", type=int, @@ -201,22 +179,6 @@ def herd_body( help="Input size (dimension N)", ) parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M") - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -226,58 +188,36 @@ def herd_body( args.tile_m, INPUT_DATATYPE, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.arange(0, (args.m * args.n), dtype=INPUT_DATATYPE).reshape( args.m, args.n ) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - - # Compute reference results for sampled indices - sampled_values = np.array( - [np.max(input_a[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) + # Stochastically sample num_sample results for verification. + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)]) - # Store as a dictionary - sampled_data = { - "shape": (args.m,), - "indices": sampled_indices, - "values": sampled_values, - } + # Compute reference results for sampled indices + sampled_values = np.array( + [np.max(input_a[i]) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + # Store as a dictionary + sampled_data = { + "shape": (args.m,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_reduce_max", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py index 8c0bd638f..26ce44e4c 100644 --- a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py +++ b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py @@ -1,21 +1,39 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT # Version 1: f32 vector rsqrt -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + import numpy as np from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.math import rsqrt from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) range_ = for_ @@ -23,22 +41,16 @@ @module_builder def build_module(n, tile_n, np_dtype_in, arch="aie2"): a_size = [n] - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = 16 index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def vector_rsqrt(arg0, arg2): @@ -64,75 +76,29 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_c = rsqrt(v_a) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -143,23 +109,11 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 512 TILE_N = 64 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector_rsqrt example (Version 1: f32 vector)", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the vector_rsqrt example (Version 1: f32 vector)" ) parser.add_argument( "--n", @@ -175,22 +129,6 @@ def herd_body( default="aie2", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -208,63 +146,34 @@ def herd_body( INPUT_DATATYPE, args.arch, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) # Generate input values in range [0.1, 3.0] to match working testbench pattern - # This ensures positive values (required for rsqrt) and stays well within bfloat16 range np.random.seed(10) input_a = np.abs(np.random.uniform(0.1, 3.0, args.n)).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), + ] + ) + sampled_values = np.array( + [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_rsqrt", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py index 02601fa95..9f54a7dfe 100644 --- a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py +++ b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py @@ -1,20 +1,34 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT # Version 2: f32 scalar rsqrt in loop -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + +import numpy as np from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write from air.dialects.func import FuncOp from air.dialects.math import rsqrt from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + tiled_1d_offset, + make_air_parser, + run_on_npu, + check_print_module, +) range_ = for_ @@ -22,22 +36,14 @@ @module_builder def build_module(n, tile_n, np_dtype_in, arch="aie2"): a_size = [n] - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = 16 index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def vector_rsqrt(arg0, arg2): @@ -63,28 +69,12 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) @@ -121,9 +111,7 @@ def herd_body( dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -134,24 +122,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 512 TILE_N = 64 INPUT_DATATYPE = np.float32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector_rsqrt example (Version 2: f32 scalar in loop)", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the vector_rsqrt example (Version 2: f32 scalar in loop)" ) parser.add_argument( "--n", @@ -167,22 +143,6 @@ def herd_body( default="aie2", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -200,63 +160,34 @@ def herd_body( INPUT_DATATYPE, args.arch, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) # Generate input values in range [0.1, 3.0] to match working testbench pattern - # This ensures positive values (required for rsqrt) and stays well within bfloat16 range np.random.seed(10) input_a = np.abs(np.random.uniform(0.1, 3.0, args.n)).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), + ] + ) + sampled_values = np.array( + [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_rsqrt", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py index f15f81325..bf5ff4cbc 100644 --- a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py +++ b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py @@ -1,20 +1,38 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT # Version 3: bf16 vector rsqrt with f32 conversion -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + +import numpy as np from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp, extf, truncf -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.math import rsqrt from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + check_print_module, +) range_ = for_ @@ -22,22 +40,16 @@ @module_builder def build_module(n, tile_n, np_dtype_in, arch="aie2"): a_size = [n] - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = 16 index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def vector_rsqrt(arg0, arg2): @@ -63,58 +75,23 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - # Load bf16 vector - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a_bf16 = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], + v_a_bf16 = vec_read( + l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap ) # Extend bf16 to f32 @@ -133,22 +110,13 @@ def herd_body( ) # Store bf16 vector - transfer_write( - None, - v_c_bf16, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c_bf16, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -159,24 +127,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 512 TILE_N = 64 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector_rsqrt example (Version 3: bf16 vector with f32 conversion)", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", + parser = make_air_parser( + "Builds, runs, and tests the vector_rsqrt example (Version 3: bf16 vector with f32 conversion)" ) parser.add_argument( "--n", @@ -192,22 +148,6 @@ def herd_body( default="aie2", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -217,63 +157,34 @@ def herd_body( INPUT_DATATYPE, args.arch, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) - # Generate input values in range [0.1, 3.0] to match working testbench pattern - # This ensures positive values (required for rsqrt) and stays well within bfloat16 range + # Generate input values in range [0.1, 3.0] np.random.seed(10) input_a = np.abs(np.random.uniform(0.1, 3.0, args.n)).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, args.n, num_samples), + ] + ) + sampled_values = np.array( + [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_rsqrt", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - ) + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_select/vector_select.py b/programming_examples/primitives/vector_examples/vector_select/vector_select.py index 6f7deb79a..ea6e73067 100644 --- a/programming_examples/primitives/vector_examples/vector_select/vector_select.py +++ b/programming_examples/primitives/vector_examples/vector_select/vector_select.py @@ -1,18 +1,36 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp, cmpf, select, CmpFPredicate -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -24,23 +42,16 @@ @module_builder def build_module(n, tile_n, np_dtype_in, vector_size=16): a_size = [n] - b_size = a_size - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_select(arg0, arg1, arg2): @@ -63,100 +74,40 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) dma_memcpy_nd( l1_b_data, _l3_b, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_b_vec = subview( - l1_b_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) - v_b = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_b_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # Compare: a >= b (ordered greater-or-equal) cmp_result = cmpf(CmpFPredicate.OGE, v_a, v_b) # Select: when cmp is true (a >= b), pick a; otherwise pick b v_c = select(cmp_result, v_a, v_b) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -168,26 +119,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vector_select example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the vector_select example") parser.add_argument( "--n", type=int, @@ -201,22 +138,6 @@ def herd_body( default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) parser.add_argument( "--bf16-emulation", dest="bf16_emulation", @@ -237,63 +158,25 @@ def herd_body( INPUT_DATATYPE, args.vector_size, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE) input_b = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [max(input_a[i], input_b[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_a, input_b], + args.n, + lambda a, b: max(a, b), + INPUT_DATATYPE, + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], instance_name="vector_select", + stochastic_expected_outputs=[sampled_data], + rtol=5e-2 if bf16_emulation else 1e-3, bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=5e-2 if bf16_emulation else 1e-3, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py b/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py index 9403059e8..b1ae243db 100644 --- a/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py +++ b/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py @@ -1,18 +1,36 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, load, store, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) import numpy as np @@ -24,23 +42,16 @@ @module_builder def build_module(n, tile_n, np_dtype_in, vector_size=16): a_size = [n] - b_size = a_size - out_size = a_size xrt_dtype_in = type_mapper(np_dtype_in) num_tiles = 2 assert n % (tile_n * num_tiles) == 0 VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy) def vector_sub(arg0, arg1, arg2): @@ -63,97 +74,37 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_a_data, _l3_a, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) dma_memcpy_nd( l1_b_data, _l3_b, - src_offsets=[ - offset, - ], + src_offsets=[offset], src_sizes=[tile_n], src_strides=[1], ) c0 = ConstantOp(index_type, 0) - c1 = ConstantOp(index_type, 1) cVecSize = ConstantOp(index_type, VECTOR_SIZE) cTileN = ConstantOp(index_type, tile_n) + cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_a_vec = subview( - l1_a_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_b_vec = subview( - l1_b_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_c_vec = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) - v_a = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_a_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) - v_b = transfer_read( - VectorType.get([VECTOR_SIZE], xrt_dtype_in), - sub_b_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - cst0, - [True], - ) + v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) + v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) v_c = arith.SubFOp(v_a, v_b) - transfer_write( - None, - v_c, - sub_c_vec, - [c0], - AffineMapAttr.get(AffineMap.get_identity(1)), - [True], - ) + vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( _l3_c, l1_out_data, - dst_offsets=[ - offset, - ], + dst_offsets=[offset], dst_sizes=[tile_n], dst_strides=[1], ) @@ -165,26 +116,12 @@ def herd_body( if __name__ == "__main__": - # Default values. N = 65536 TILE_N = 1024 VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, @@ -198,22 +135,6 @@ def herd_body( default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) parser.add_argument( "--bf16-emulation", dest="bf16_emulation", @@ -234,65 +155,24 @@ def herd_body( INPUT_DATATYPE, args.vector_size, ) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) input_a = np.arange(0, args.n, dtype=np.int64) input_a = input_a.astype(INPUT_DATATYPE) input_b = np.arange(0, args.n, dtype=np.int64) input_b = input_b.astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.n, num_samples), # i indices - ] - ) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] - input_b[i] for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - - # Store as a dictionary - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check( + [input_a, input_b], args.n, lambda a, b: a - b, INPUT_DATATYPE + ) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a, input_b], instance_name="vector_sub", + stochastic_expected_outputs=[sampled_data], + rtol=5e-2 if bf16_emulation else 1e-3, bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=5e-2 if bf16_emulation else 1e-3, - ) ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - bf16_emulation=bf16_emulation, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py b/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py index d5fae8bda..f3435b0db 100644 --- a/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py +++ b/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py @@ -12,21 +12,39 @@ Computation is vectorized using vector.transfer_read/write. """ -import argparse +import os +import sys + +sys.path.insert( + 0, + os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + ), +) + import numpy as np from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith, math as math_dialect from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write +from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper +from utils import ( + make_l1_memref, + make_vec_type, + identity_map_1d, + tiled_1d_offset, + vec_read, + vec_write, + make_air_parser, + run_on_npu, + stochastic_check, + check_print_module, +) range_ = for_ @@ -41,14 +59,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16): index_type = IndexType.get() l3memrefTy = MemRefType.get([n], xrt_dtype_in) - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in) + vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_1d() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def vector_tanh(arg0, arg1): @@ -59,20 +72,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): l1_out = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tiled_1d_offset(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_in, @@ -88,17 +88,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): cst0 = arith.ConstantOp(xrt_dtype_in, 0.0) for j in range_(c0, cTileN, cVecSize): - sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1]) - sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1]) - - v_in = transfer_read( - vecTy, sub_in, [c0], identity_map, cst0, [True] - ) + v_in = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # Hardware tanh intrinsic on AIE2P v_out = math_dialect.tanh(v_in) - transfer_write(None, v_out, sub_out, [c0], identity_map, [True]) + vec_write(v_out, l1_out, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -119,12 +114,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the vectorized tanh example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the vectorized tanh example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( @@ -140,69 +130,28 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): default="aie2p", help="Target AIE architecture (aie2 or aie2p)", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size) - if args.print_module_only: - print(mlir_module) - exit(0) + check_print_module(mlir_module, args) np.random.seed(42) input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) + # Reference: compute tanh in f32 precision + def tanh_ref(x): + return np.tanh(x.astype(np.float32)) - # Reference: compute tanh in f32 precision - sampled_values = np.array( - [np.tanh(input_a[i].astype(np.float32)) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_data = stochastic_check([input_a], args.n, tanh_ref, INPUT_DATATYPE) + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="vector_tanh", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, + atol=5e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - atol=5e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/relu/relu.py b/programming_examples/relu/relu.py index 69b23c301..7a4727167 100644 --- a/programming_examples/relu/relu.py +++ b/programming_examples/relu/relu.py @@ -11,23 +11,26 @@ configurable VECTOR_SIZE (default 16). """ -import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import numpy as np np.random.seed(42) from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu +from utils import vec_read, vec_write range_ = for_ @@ -41,18 +44,10 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16): VECTOR_SIZE = vector_size index_type = IndexType.get() - # L3 MemRefTypes l3memrefTy = MemRefType.get([n], xrt_dtype_in) - - # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) + vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_attr() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def relu(arg0, arg1): @@ -73,21 +68,7 @@ def herd_body( l1_out_data = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_in_data, @@ -104,24 +85,10 @@ def herd_body( v_zero = BroadcastOp(vecTy, cst0) for j in range_(c0, cTileN, cVecSize): - sub_in = subview( - l1_in_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - sub_out = subview( - l1_out_data.result, - [j], - [VECTOR_SIZE], - [1], - ) - v_in = transfer_read( - vecTy, sub_in, [c0], identity_map, cst0, [True] - ) + v_in = vec_read(l1_in_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # RELU: max(x, 0) using arith.maximumf on bf16 v_relu = arith.MaximumFOp(v_in, v_zero) - transfer_write(None, v_relu, sub_out, [c0], identity_map, [True]) + vec_write(v_relu, l1_out_data, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -142,12 +109,7 @@ def herd_body( TILE_N = 1024 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the RELU example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the RELU example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( @@ -156,20 +118,6 @@ def herd_body( default=16, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() @@ -181,41 +129,24 @@ def herd_body( # Mix of positive and negative values for RELU testing input_a = np.random.randn(args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - sampled_values = np.array( - [np.maximum(input_a[i], 0) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_indices = np.vstack([np.random.randint(0, args.n, 100)]) + sampled_values = np.array( + [np.maximum(input_a[i], 0) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="relu", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/rms_norm/rms_norm.py b/programming_examples/rms_norm/rms_norm.py index 96fab5328..4e52dfaf2 100644 --- a/programming_examples/rms_norm/rms_norm.py +++ b/programming_examples/rms_norm/rms_norm.py @@ -13,7 +13,7 @@ configurable VECTOR_SIZE (default 16 for AIE2). """ -import argparse +import numpy as np from ml_dtypes import bfloat16 from air.ir import * @@ -28,8 +28,7 @@ ) from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -43,16 +42,15 @@ def build_module(M, N, np_dtype, vector_size=16): N % vector_size == 0 ), f"N ({N}) must be divisible by vector_size ({vector_size})" - vecTy = VectorType.get([vector_size], xrt_dtype) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + vecTy = vec_type(vector_size, xrt_dtype) + identity_map = identity_map_attr() # L3 types l3MemrefTy = MemRefType.get([M, N], xrt_dtype) # L1 types - l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - l1RowTy = MemRefType.get([N], xrt_dtype, memory_space=l1_mem_space) - l1VecTy = MemRefType.get([vector_size], xrt_dtype, memory_space=l1_mem_space) + l1RowTy = l1_memref_type([N], xrt_dtype) + l1VecTy = l1_memref_type([vector_size], xrt_dtype) @FuncOp.from_py_func(l3MemrefTy, l3MemrefTy) def rms_norm(arg0, arg1): @@ -155,12 +153,7 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out): VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the RMS normalization example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the RMS normalization example") parser.add_argument("--M", type=int, default=M_DEFAULT, help="M dimension (rows)") parser.add_argument("--N", type=int, default=N_DEFAULT, help="N dimension (cols)") parser.add_argument( @@ -169,20 +162,6 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out): default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.M, args.N, INPUT_DATATYPE, args.vector_size) @@ -200,30 +179,14 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out): ) y_expected = (x_input / rms).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[x_input], instance_name="rms_norm", - runtime_loop_tiling_sizes=[4, 4], + expected_outputs=[y_expected], + rtol=5e-2, + atol=5e-1, ) - exit( - runner.run_test( - mlir_module, - inputs=[x_input], - expected_outputs=[y_expected], - rtol=5e-2, - atol=5e-1, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/rope_lut/rope_lut.py b/programming_examples/rope_lut/rope_lut.py index 04c90994b..eaa1d481f 100644 --- a/programming_examples/rope_lut/rope_lut.py +++ b/programming_examples/rope_lut/rope_lut.py @@ -16,7 +16,6 @@ Uses a single AIE tile with DMA transfers between L3 and L1 memory. """ -import argparse import numpy as np from ml_dtypes import bfloat16 @@ -27,8 +26,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -110,30 +108,11 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_lut, l3_out): INPUT_DATATYPE = bfloat16 THETA = 10000.0 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the RoPE (LUT-based) example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the RoPE (LUT-based) example") parser.add_argument("--seq-len", type=int, default=SEQ_LEN, help="Sequence length") parser.add_argument( "--embed-dim", type=int, default=EMBED_DIM, help="Embedding dimension" ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.seq_len, args.embed_dim, INPUT_DATATYPE) @@ -160,44 +139,28 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_lut, l3_out): lut[r, 2 * i + 1] = np.sin(angle) lut = lut.astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - # Compute reference output - ref = np.copy(input_data).astype(np.float32) - input_f32 = input_data.astype(np.float32) - lut_f32 = lut.astype(np.float32) - for r in range(seq_len): - for i in range(embed_dim // 2): - cos_v = lut_f32[r, 2 * i] - sin_v = lut_f32[r, 2 * i + 1] - x0 = input_f32[r, 2 * i] - x1 = input_f32[r, 2 * i + 1] - ref[r, 2 * i] = x0 * cos_v - x1 * sin_v - ref[r, 2 * i + 1] = x0 * sin_v + x1 * cos_v - ref_flat = ref.flatten().astype(INPUT_DATATYPE) - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + # Compute reference output + ref = np.copy(input_data).astype(np.float32) + input_f32 = input_data.astype(np.float32) + lut_f32 = lut.astype(np.float32) + for r in range(seq_len): + for i in range(embed_dim // 2): + cos_v = lut_f32[r, 2 * i] + sin_v = lut_f32[r, 2 * i + 1] + x0 = input_f32[r, 2 * i] + x1 = input_f32[r, 2 * i + 1] + ref[r, 2 * i] = x0 * cos_v - x1 * sin_v + ref[r, 2 * i + 1] = x0 * sin_v + x1 * cos_v + ref_flat = ref.flatten().astype(INPUT_DATATYPE) + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_data.flatten(), lut.flatten()], instance_name="rope", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_data.flatten(), lut.flatten()], - expected_outputs=[ref_flat], - rtol=5e-2, - atol=5e-2, - ) + expected_outputs=[ref_flat], + rtol=5e-2, + atol=5e-2, ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/rope_sincos/rope_sincos.py b/programming_examples/rope_sincos/rope_sincos.py index a398a7651..4fcd9d48f 100644 --- a/programming_examples/rope_sincos/rope_sincos.py +++ b/programming_examples/rope_sincos/rope_sincos.py @@ -21,7 +21,6 @@ XFAIL on Peano. See rope_lut/ for a Peano-compatible alternative. """ -import argparse import numpy as np from math import cos, sin from ml_dtypes import bfloat16 @@ -33,8 +32,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -193,12 +191,9 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): HERD_N = 4 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the RoPE (on-chip sin/cos) example", + parser = make_air_parser( + "Builds, runs, and tests the RoPE (on-chip sin/cos) example" ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") parser.add_argument("--head-size", type=int, default=HEAD_SIZE, help="Head size") parser.add_argument( "--num-heads", type=int, default=NUM_HEADS, help="Number of heads" @@ -209,20 +204,6 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): default=HERD_N, help="Number of L1 tiles along the N dimension", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module( @@ -258,29 +239,13 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): outputs[i][s + args.head_size] = v0 * fcr - v1 * fci outputs[i][s + args.head_size + 1] = v0 * fci + v1 * fcr - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[inputs], instance_name="rope", - runtime_loop_tiling_sizes=[4, 4], + expected_outputs=[outputs], + rtol=1e1, ) - exit( - runner.run_test( - mlir_module, - inputs=[inputs], - expected_outputs=[outputs], - rtol=1e1, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/segment_alloc/segment_alloc.py b/programming_examples/segment_alloc/segment_alloc.py index 3ad90ff6b..3cea52a2a 100644 --- a/programming_examples/segment_alloc/segment_alloc.py +++ b/programming_examples/segment_alloc/segment_alloc.py @@ -1,6 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse import numpy as np from air.ir import * @@ -8,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -42,15 +41,8 @@ def launch_body(a, b): # The arguments are still the input and the output @segment(name="seg", operands=[a, b]) def segment_body(arg2, arg3): - # We want to store our data in L1 memory - mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2) - # This is the type definition of the tile - tile_type_l2 = MemRefType.get( - shape=TILE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l2, - ) + tile_type_l2 = l2_memref_type(TILE_SIZE, xrt_dtype) # We must allocate a buffer of tile size for the input/output tile_in_l2 = AllocOp(tile_type_l2, [], []) @@ -60,15 +52,8 @@ def segment_body(arg2, arg3): @herd(name="copyherd", sizes=[1, 1], operands=[arg2, arg3, tile_in_l2]) def herd_body(tx, ty, sx, sy, a, b, my_l2_tile): - # We want to store our data in L1 memory - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - # This is the type definition of the tile - tile_type_l1 = MemRefType.get( - shape=TILE_SIZE, - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + tile_type_l1 = l1_memref_type(TILE_SIZE, xrt_dtype) # We must allocate a buffer of tile size for the input/output tile_in_l1 = AllocOp(tile_type_l1, [], []) @@ -114,29 +99,7 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the segment_alloc example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) - + parser = make_air_parser("Builds, runs, and tests the segment_alloc example") args = parser.parse_args() mlir_module = build_module() @@ -150,10 +113,11 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile): for w in range(TILE_WIDTH): output_b[h, w] = input_a[h, w] - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + run_on_npu( + args, + mlir_module, + inputs=[input_a], + expected_outputs=[output_b], instance_name="copy", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/segment_unroll/segment_unroll.py b/programming_examples/segment_unroll/segment_unroll.py index 7eb5d2311..7d64f8046 100644 --- a/programming_examples/segment_unroll/segment_unroll.py +++ b/programming_examples/segment_unroll/segment_unroll.py @@ -13,7 +13,6 @@ input data using channels indexed by segment coordinates. """ -import argparse import numpy as np from air.ir import * @@ -22,7 +21,7 @@ from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ from air.dialects import arith -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -53,12 +52,7 @@ def build_module(): memrefTyInOut = T.memref(VECTOR_LEN, xrt_dtype) # L1 memory space for tile data - mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1) - image_type_l1 = MemRefType.get( - shape=[VECTOR_LEN // SEGMENT_SIZE_X], - element_type=xrt_dtype, - memory_space=mem_space_l1, - ) + image_type_l1 = l1_memref_type([VECTOR_LEN // SEGMENT_SIZE_X], xrt_dtype) # Define channels for data movement with dimensions matching segment unroll # Each unrolled segment instance needs its own channel endpoint @@ -131,31 +125,7 @@ def herd_body(tx, ty, sx, sy, herd_seg_x, herd_seg_y): if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="segment_unroll.py", - description="Builds, runs, and tests the segment unroll example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - help="Enable verbose output", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - help="Print the generated MLIR module and exit", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) - + parser = make_air_parser("Builds, runs, and tests the segment unroll example") args = parser.parse_args() mlir_module = build_module() @@ -169,10 +139,11 @@ def herd_body(tx, ty, sx, sy, herd_seg_x, herd_seg_y): input_a = np.arange(VECTOR_LEN, dtype=INOUT_DATATYPE) output_b = input_a + 10 - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, + run_on_npu( + args, + mlir_module, + inputs=[input_a], + expected_outputs=[output_b], instance_name="segment_unroll_test", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/shared_l1/run.py b/programming_examples/shared_l1/run.py index c29d6680c..eabd8c446 100644 --- a/programming_examples/shared_l1/run.py +++ b/programming_examples/shared_l1/run.py @@ -33,7 +33,7 @@ from air.dialects.scf import for_, yield_ from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from ml_dtypes import bfloat16 # Constants for buffer sizes @@ -72,6 +72,13 @@ def parse_args(): dest="output_format", help="Output format for the compiled binary (default: xclbin)", ) + parser.add_argument( + "--compile-mode", + type=str, + choices=["compile-only", "compile-and-run"], + dest="compile_mode", + default="compile-and-run", + ) args = parser.parse_args() return args @@ -402,21 +409,16 @@ def main(): A = np.random.rand(M_SIZE, N_SIZE).astype(bfloat16) C = (A + 3.0).astype(bfloat16) - # Run the module using XRTRunner - runner = XRTRunner( - omit_while_true_loop=False, - verbose=False, - runtime_loop_tiling_sizes=[1, 1], - output_format=args.output_format, - instance_name="func1", - debug_ir=True, - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[A], + instance_name="func1", expected_outputs=[C], rtol=1e-2, + runtime_loop_tiling_sizes=[1, 1], + debug_ir=True, ) ) diff --git a/programming_examples/shim_dma_2d/run.py b/programming_examples/shim_dma_2d/run.py index b10217e47..6ff41e880 100644 --- a/programming_examples/shim_dma_2d/run.py +++ b/programming_examples/shim_dma_2d/run.py @@ -3,46 +3,35 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -import argparse import numpy as np -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from shim_dma_2d import * INOUT_DATATYPE = np.int32 if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the shim_dma_2d example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) + parser = make_air_parser("Builds, runs, and tests the shim_dma_2d example") args = parser.parse_args() mlir_module = build_module() + if args.print_module_only: + print(mlir_module) + exit(0) + input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE) output_b = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE) for h in range(TILE_HEIGHT): for w in range(TILE_WIDTH): output_b[h, w] = input_a[h, w] - runner = XRTRunner( - verbose=args.verbose, - output_format=args.output_format, - instance_name="copy", - runtime_loop_tiling_sizes=[4, 4], + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], + instance_name="copy", + expected_outputs=[output_b], + ) ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) diff --git a/programming_examples/sigmoid/sigmoid.py b/programming_examples/sigmoid/sigmoid.py index 54811f1ff..89b5b3948 100644 --- a/programming_examples/sigmoid/sigmoid.py +++ b/programming_examples/sigmoid/sigmoid.py @@ -14,21 +14,24 @@ Computation is vectorized using vector.transfer_read/write. """ -import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import numpy as np from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith, math as math_dialect from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu +from utils import vec_read, vec_write range_ = for_ @@ -43,14 +46,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16): index_type = IndexType.get() l3memrefTy = MemRefType.get([n], xrt_dtype_in) - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) + vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_attr() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def sigmoid(arg0, arg1): @@ -61,20 +59,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): l1_out = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_in, @@ -94,10 +79,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): v_one = BroadcastOp(vecTy, one_const) for j in range_(c0, cTileN, cVecSize): - sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1]) - sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1]) - - v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True]) + v_x = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # sigmoid(x) = 0.5 * (tanh(x/2) + 1) # Uses hardware tanh intrinsic — no exp or division needed. @@ -106,7 +88,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): v_tanh_plus_one = arith.addf(v_tanh, v_one.result) v_sigmoid = arith.mulf(v_tanh_plus_one, v_half.result) - transfer_write(None, v_sigmoid, sub_out, [c0], identity_map, [True]) + vec_write(v_sigmoid, l1_out, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -126,31 +108,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): TILE_N = 1024 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the Sigmoid example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the Sigmoid example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( "--vector-size", type=int, default=16, help="Vector size for SIMD operations" ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size) @@ -161,48 +124,30 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): np.random.seed(0) input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - - # Sigmoid reference using tanh-based identity (matches hardware computation) - def sigmoid_ref(x): - x_f32 = x.astype(np.float32) - return 0.5 * (np.tanh(x_f32 / 2.0) + 1.0) + # Sigmoid reference using tanh-based identity (matches hardware computation) + def sigmoid_ref(x): + x_f32 = x.astype(np.float32) + return 0.5 * (np.tanh(x_f32 / 2.0) + 1.0) - sampled_values = np.array( - [sigmoid_ref(input_a[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_indices = np.vstack([np.random.randint(0, args.n, 100)]) + sampled_values = np.array( + [sigmoid_ref(input_a[i]) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="sigmoid", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, + atol=5e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - atol=5e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/silu/silu.py b/programming_examples/silu/silu.py index cf44bcfe6..f201b8864 100644 --- a/programming_examples/silu/silu.py +++ b/programming_examples/silu/silu.py @@ -14,21 +14,24 @@ Computation is vectorized using vector.transfer_read/write. """ -import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + import numpy as np from ml_dtypes import bfloat16 from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects import arith, math as math_dialect from air.dialects.arith import ConstantOp -from air.dialects.memref import AllocOp, DeallocOp, subview -from air.dialects.vector import transfer_read, transfer_write, BroadcastOp +from air.dialects.memref import AllocOp, DeallocOp +from air.dialects.vector import BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu +from utils import vec_read, vec_write range_ = for_ @@ -43,14 +46,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16): index_type = IndexType.get() l3memrefTy = MemRefType.get([n], xrt_dtype_in) - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) - - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) + vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in) + imap = identity_map_attr() @FuncOp.from_py_func(l3memrefTy, l3memrefTy) def silu(arg0, arg1): @@ -61,20 +59,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): l1_out = AllocOp(l1MemrefTy, [], []) for _l_ivx in range_(0, n, tile_n * num_tiles): - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [_l_ivx, _ty]) + offset = tile_offset_1d(_l_ivx, _ty, tile_n) dma_memcpy_nd( l1_in, @@ -94,10 +79,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): v_one = BroadcastOp(vecTy, one_const) for j in range_(c0, cTileN, cVecSize): - sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1]) - sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1]) - - v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True]) + v_x = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap) # SiLU(x) = x * sigmoid(x) # = x * 0.5 * (tanh(x/2) + 1) @@ -108,7 +90,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): v_sigmoid = arith.mulf(v_tanh_plus_one, v_half.result) v_silu = arith.mulf(v_x, v_sigmoid) - transfer_write(None, v_silu, sub_out, [c0], identity_map, [True]) + vec_write(v_silu, l1_out, j, VECTOR_SIZE, c0, imap) yield_([]) dma_memcpy_nd( @@ -128,31 +110,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): TILE_N = 1024 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the SiLU example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the SiLU example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( "--vector-size", type=int, default=16, help="Vector size for SIMD operations" ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size) @@ -163,48 +126,30 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out): np.random.seed(0) input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - - # SiLU reference using tanh-based sigmoid (matches hardware computation) - def silu_ref(x): - x_f32 = x.astype(np.float32) - return x_f32 * 0.5 * (np.tanh(x_f32 / 2.0) + 1.0) + # SiLU reference using tanh-based sigmoid (matches hardware computation) + def silu_ref(x): + x_f32 = x.astype(np.float32) + return x_f32 * 0.5 * (np.tanh(x_f32 / 2.0) + 1.0) - sampled_values = np.array( - [silu_ref(input_a[i]) for i in zip(*sampled_indices)], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + sampled_indices = np.vstack([np.random.randint(0, args.n, 100)]) + sampled_values = np.array( + [silu_ref(input_a[i]) for i in zip(*sampled_indices)], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + exit( + run_on_npu( + args, + mlir_module, + inputs=[input_a], instance_name="silu", - runtime_loop_tiling_sizes=[4, 4], + stochastic_expected_outputs=[sampled_data], + rtol=1e-1, + atol=5e-2, ) - exit( - runner.run_test( - mlir_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - atol=5e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/programming_examples/sine_cosine/sine_cosine.py b/programming_examples/sine_cosine/sine_cosine.py index 008028bf2..9de60847b 100644 --- a/programming_examples/sine_cosine/sine_cosine.py +++ b/programming_examples/sine_cosine/sine_cosine.py @@ -1,6 +1,5 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse from math import cos, sin, sqrt, exp from air.ir import * @@ -10,8 +9,7 @@ from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from ml_dtypes import bfloat16 import numpy as np @@ -141,20 +139,7 @@ def herd_body( SIN_OR_COS = "sin" INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, @@ -175,22 +160,6 @@ def herd_body( choices=["sin", "cos"], help="Sine or cosine mode (must be one of [sin, cos])", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -217,33 +186,13 @@ def herd_body( else: raise AssertionError - if args.compile_mode == "compile-and-run": - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[inputs], instance_name="sine_cosine", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[inputs], - expected_outputs=[outputs], - rtol=1e0, - ) + expected_outputs=[outputs], + rtol=1e0, ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + ) diff --git a/programming_examples/softmax/softmax.py b/programming_examples/softmax/softmax.py index 24c5daaf0..0383f4d17 100644 --- a/programming_examples/softmax/softmax.py +++ b/programming_examples/softmax/softmax.py @@ -1,17 +1,14 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse from math import cos, sin, sqrt, exp from air.ir import * -from air.dialects.affine import apply as affine_apply from air.dialects.air import * from air.dialects.arith import ConstantOp from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu from ml_dtypes import bfloat16 import numpy as np @@ -32,11 +29,7 @@ def build_module(n, tile_n, herd_n, np_dtype_in): l3memrefTy = MemRefType.get(a_size, xrt_dtype_in) # L1 MemRefTypes - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1), - ) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) # Function declaration softmax_func = FuncOp( @@ -68,20 +61,7 @@ def herd_body( for t in range_(0, n, tile_n * herd_n): - offset_map = AffineMap.get( - 0, - 2, - [ - AffineExpr.get_add( - AffineSymbolExpr.get(0), - AffineExpr.get_mul( - AffineSymbolExpr.get(1), - AffineConstantExpr.get(tile_n), - ), - ) - ], - ) - offset = affine_apply(offset_map, [t, _ty]) + offset = tile_offset_1d(t, _ty, tile_n) dma_memcpy_nd( l1_a_data, @@ -121,20 +101,7 @@ def herd_body( HERD_N = 4 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--n", type=int, @@ -148,22 +115,6 @@ def herd_body( default=HERD_N, help="Number of L1 tiles along the N dimension", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - help="Configure to whether to run after compile", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -191,33 +142,13 @@ def herd_body( for i in range(args.tile_n): outputs[j][i] = outputs[j][i] / sum_val - if args.compile_mode == "compile-and-run": - - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="softmax", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[inputs], - expected_outputs=[outputs], - rtol=1e-1, - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + run_on_npu( + args, + mlir_module, + inputs=[inputs], + expected_outputs=[outputs], + instance_name="softmax", + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + rtol=1e-1, + ) diff --git a/programming_examples/swiglu/swiglu.py b/programming_examples/swiglu/swiglu.py index 879b1d261..7e3ef0952 100644 --- a/programming_examples/swiglu/swiglu.py +++ b/programming_examples/swiglu/swiglu.py @@ -21,7 +21,6 @@ Computation is vectorized using vector.transfer_read/write. """ -import argparse import numpy as np from ml_dtypes import bfloat16 @@ -33,8 +32,7 @@ from air.dialects.vector import transfer_read, transfer_write, BroadcastOp from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -51,21 +49,12 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16): # gate and up packed as [2, N]: row 0 = gate, row 1 = up l3GateUpTy = MemRefType.get([2, n], xrt_dtype_in) - l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - l1MemrefTy = MemRefType.get( - shape=[tile_n], - element_type=xrt_dtype_in, - memory_space=l1_mem_space, - ) + l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in) # L1 buffer for gate+up tile: flat [2*tile_n] for simple 1D indexing - l1GateUpTy = MemRefType.get( - shape=[2 * tile_n], - element_type=xrt_dtype_in, - memory_space=l1_mem_space, - ) + l1GateUpTy = l1_memref_type([2 * tile_n], xrt_dtype_in) - vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in) + identity_map = identity_map_attr() @FuncOp.from_py_func(l3memrefTy, l3GateUpTy, l3memrefTy) def swiglu(arg0, arg1, arg2): @@ -175,31 +164,12 @@ def herd_body( TILE_N = 1024 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the SwiGLU example", - ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") + parser = make_air_parser("Builds, runs, and tests the SwiGLU example") parser.add_argument("--n", type=int, default=N, help="Total number of elements") parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size") parser.add_argument( "--vector-size", type=int, default=16, help="Vector size for SIMD operations" ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size) @@ -215,55 +185,39 @@ def herd_body( # Pack gate and up into [2, N]: row 0 = gate, row 1 = up input_gate_up = np.stack([input_gate, input_up]).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) - - # SwiGLU reference using tanh-based sigmoid (matches hardware computation) - def swiglu_ref(x, gate, up): - x_f32 = x.astype(np.float32) - g_f32 = gate.astype(np.float32) - u_f32 = up.astype(np.float32) - xg = x_f32 * g_f32 - silu_xg = xg * 0.5 * (np.tanh(xg / 2.0) + 1.0) - return silu_xg * (x_f32 * u_f32) - - sampled_values = np.array( - [ - swiglu_ref(input_x[i], input_gate[i], input_up[i]) - for i in zip(*sampled_indices) - ], - dtype=INPUT_DATATYPE, - ) - sampled_data = { - "shape": (args.n,), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="swiglu", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - mlir_module, - inputs=[input_x, input_gate_up], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, - atol=5e-2, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)]) + + # SwiGLU reference using tanh-based sigmoid (matches hardware computation) + def swiglu_ref(x, gate, up): + x_f32 = x.astype(np.float32) + g_f32 = gate.astype(np.float32) + u_f32 = up.astype(np.float32) + xg = x_f32 * g_f32 + silu_xg = xg * 0.5 * (np.tanh(xg / 2.0) + 1.0) + return silu_xg * (x_f32 * u_f32) + + sampled_values = np.array( + [ + swiglu_ref(input_x[i], input_gate[i], input_up[i]) + for i in zip(*sampled_indices) + ], + dtype=INPUT_DATATYPE, + ) + sampled_data = { + "shape": (args.n,), + "indices": sampled_indices, + "values": sampled_values, + } + + run_on_npu( + args, + mlir_module, + inputs=[input_x, input_gate_up], + stochastic_expected_outputs=[sampled_data], + instance_name="swiglu", + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + rtol=1e-1, + atol=5e-2, + ) diff --git a/programming_examples/utils.py b/programming_examples/utils.py new file mode 100644 index 000000000..a2e5d77d5 --- /dev/null +++ b/programming_examples/utils.py @@ -0,0 +1,167 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT +"""Shared helpers for programming_examples. + +All helpers are importable from any example directory via: + import sys, os; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + from utils import ... +or, when the example is one level deep (e.g. relu/relu.py): + import sys, os; sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) + from utils import ... +""" + +import argparse +import numpy as np + +from air.ir import * +from air.dialects.affine import apply as affine_apply +from air.dialects import arith +from air.dialects.memref import subview +from air.dialects.vector import transfer_read, transfer_write +from air.dialects.air import MemorySpace +from air.backend.xrt_runner import type_mapper, run_on_npu, make_air_parser +from air.backend.xrt import compile_air, get_air_runtime, XRTTensor +from air.extras import types as T + +# --------------------------------------------------------------------------- +# MLIR construct helpers (used inside @module_builder) +# --------------------------------------------------------------------------- + + +def make_l1_memref(shape, dtype): + """MemRefType in L1 (per-core scratchpad) memory space.""" + return MemRefType.get( + shape, dtype, memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1) + ) + + +def make_l2_memref(shape, dtype): + """MemRefType in L2 (segment-shared) memory space.""" + return MemRefType.get( + shape, dtype, memory_space=IntegerAttr.get(T.i32(), MemorySpace.L2) + ) + + +def make_vec_type(size, dtype): + """1D VectorType of given length and element type.""" + return VectorType.get([size], dtype) + + +def identity_map_1d(): + """1D identity AffineMapAttr — the standard transfer_read/write permutation map.""" + return AffineMapAttr.get(AffineMap.get_identity(1)) + + +def tiled_1d_offset(loop_var, tile_idx, tile_n): + """ + Compute offset = loop_var + tile_idx * tile_n via affine_apply. + + Replaces the 12-line AffineMap.get / AffineExpr chain used in every + 1D vectorized example with a 1x2 herd. + + Args: + loop_var: outer loop induction variable (SSA Value or int) + tile_idx: herd tile index, e.g. _ty (SSA Value or int) + tile_n: tile size in elements (Python int) + """ + offset_map = AffineMap.get( + 0, + 2, + [ + AffineExpr.get_add( + AffineSymbolExpr.get(0), + AffineExpr.get_mul( + AffineSymbolExpr.get(1), + AffineConstantExpr.get(tile_n), + ), + ) + ], + ) + return affine_apply(offset_map, [loop_var, tile_idx]) + + +def vec_read(buf, j, vec_size, c0, vec_ty, cst0, imap): + """subview + transfer_read with the standard fixed call signature.""" + result = buf.result if hasattr(buf, "result") else buf + sub = subview(result, [j], [vec_size], [1]) + return transfer_read(vec_ty, sub, [c0], imap, cst0, [True]) + + +def vec_write(val, buf, j, vec_size, c0, imap): + """subview + transfer_write with the standard fixed call signature.""" + result = buf.result if hasattr(buf, "result") else buf + sub = subview(result, [j], [vec_size], [1]) + transfer_write(None, val, sub, [c0], imap, [True]) + + +# --------------------------------------------------------------------------- +# Argument-parser factory +# --------------------------------------------------------------------------- + + +def make_air_parser(description, prog="run.py"): + """ + Return an ArgumentParser pre-populated with the 4 universal flags: + -v / --verbose + -p / --print-module-only + --compile-mode {compile-only, compile-and-run} + --output-format {xclbin, elf} + + The caller adds example-specific arguments (--n, --tile-n, etc.) after. + """ + p = argparse.ArgumentParser(prog=prog, description=description) + p.add_argument("-v", "--verbose", action="store_true") + p.add_argument("-p", "--print-module-only", action="store_true") + p.add_argument( + "--compile-mode", + type=str, + choices=["compile-only", "compile-and-run"], + dest="compile_mode", + default="compile-and-run", + ) + p.add_argument( + "--output-format", + type=str, + choices=["xclbin", "elf"], + default="xclbin", + dest="output_format", + ) + return p + + +# --------------------------------------------------------------------------- +# Stochastic sampling helper +# --------------------------------------------------------------------------- + + +def stochastic_check(inputs, n, ref_fn, dtype, num_samples=100): + """ + Build the stochastic_expected_outputs dict for 1D element-wise ops. + + Args: + inputs: list of numpy input arrays (same as passed to run_test) + n: total element count + ref_fn: scalar reference function, called as ref_fn(*scalars) + dtype: output numpy dtype + num_samples: number of randomly sampled indices + Returns: + dict with "shape", "indices", "values" for run_on_npu() stochastic verification + """ + sampled_indices = np.vstack([np.random.randint(0, n, num_samples)]) + sampled_values = np.array( + [ref_fn(*[inp[i] for inp in inputs]) for i in zip(*sampled_indices)], + dtype=dtype, + ) + return {"shape": (n,), "indices": sampled_indices, "values": sampled_values} + + +# --------------------------------------------------------------------------- +# Print-module-only convenience +# --------------------------------------------------------------------------- + + +def check_print_module(mlir_module, args): + """Print the MLIR module and exit if --print-module-only was passed.""" + if args.print_module_only: + print(mlir_module) + exit(0) diff --git a/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py b/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py index 5914a656e..41a3da919 100644 --- a/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py +++ b/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py @@ -1,6 +1,5 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse from ml_dtypes import bfloat16 from air.ir import * @@ -10,7 +9,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -272,20 +271,7 @@ def herd_body(_tx, _ty, _sx, _sy): INPUT_DATATYPE = bfloat16 OUTPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--k", type=int, default=K, help="K dimension size in a (1xK) * (KxN) matmul" ) @@ -301,14 +287,6 @@ def herd_body(_tx, _ty, _sx, _sy): parser.add_argument( "--tile-n", type=int, default=TILE_N, help="N dimension size of each L1 tile" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -330,17 +308,12 @@ def herd_body(_tx, _ty, _sx, _sy): ) output_c = np.dot(input_a.astype(OUTPUT_DATATYPE), input_b.astype(OUTPUT_DATATYPE)) - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="vecmat_bf16", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="vecmat_bf16", expected_outputs=[output_c], rtol=0.04, ) diff --git a/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py b/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py index 79d89f8bb..e8f60b221 100644 --- a/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py +++ b/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py @@ -1,7 +1,5 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse - from air.ir import * from air.dialects.affine import apply as affine_apply from air.dialects.air import * @@ -9,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu import numpy as np @@ -404,20 +402,7 @@ def herd_body(_tx, _ty, _sx, _sy): ACC_DATATYPE = np.int32 OUTPUT_DATATYPE = np.float32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--k", type=int, default=K, help="K dimension size in a (1xK) * (KxN) matmul" ) @@ -439,14 +424,6 @@ def herd_body(_tx, _ty, _sx, _sy): parser.add_argument( "--tile-n", type=int, default=TILE_N, help="N dimension size of each L1 tile" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -499,17 +476,12 @@ def herd_body(_tx, _ty, _sx, _sy): ) ival = 0 - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="vecmat_i8", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_a_s, input_b, input_b_s], + instance_name="vecmat_i8", expected_outputs=[output_c], ) ) diff --git a/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py b/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py index 7dab0af7a..154587707 100644 --- a/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py +++ b/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py @@ -1,7 +1,5 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import argparse - from air.ir import * from air.dialects.affine import apply as affine_apply from air.dialects.air import * @@ -9,7 +7,7 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store from air.dialects.func import FuncOp, CallOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu import numpy as np @@ -289,20 +287,7 @@ def herd_body(_tx, _ty, _sx, _sy): INPUT_DATATYPE = np.int8 OUTPUT_DATATYPE = np.int32 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the passthrough_dma example", - ) - parser.add_argument( - "-v", - "--verbose", - action="store_true", - ) - parser.add_argument( - "-p", - "--print-module-only", - action="store_true", - ) + parser = make_air_parser("Builds, runs, and tests the passthrough_dma example") parser.add_argument( "--k", type=int, default=K, help="K dimension size in a (1xK) * (KxN) matmul" ) @@ -318,14 +303,6 @@ def herd_body(_tx, _ty, _sx, _sy): parser.add_argument( "--tile-n", type=int, default=TILE_N, help="N dimension size of each L1 tile" ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - help="Output format for the compiled binary (default: xclbin)", - ) args = parser.parse_args() @@ -357,17 +334,12 @@ def herd_body(_tx, _ty, _sx, _sy): input_b = input_b.astype(INPUT_DATATYPE) output_c = np.dot(input_a.astype(OUTPUT_DATATYPE), input_b.astype(OUTPUT_DATATYPE)) - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="vecmat_i8", - runtime_loop_tiling_sizes=[4, 4], - ) exit( - runner.run_test( + run_on_npu( + args, mlir_module, inputs=[input_a, input_b], + instance_name="vecmat_i8", expected_outputs=[output_c], ) ) diff --git a/programming_examples/weighted_rms_norm/weighted_rms_norm.py b/programming_examples/weighted_rms_norm/weighted_rms_norm.py index 9a15be31c..8c6c9e6e3 100644 --- a/programming_examples/weighted_rms_norm/weighted_rms_norm.py +++ b/programming_examples/weighted_rms_norm/weighted_rms_norm.py @@ -15,7 +15,6 @@ configurable VECTOR_SIZE (default 16 for AIE2). """ -import argparse import numpy as np from ml_dtypes import bfloat16 @@ -31,8 +30,7 @@ ) from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu range_ = for_ @@ -46,17 +44,16 @@ def build_module(M, N, np_dtype, vector_size=16): N % vector_size == 0 ), f"N ({N}) must be divisible by vector_size ({vector_size})" - vecTy = VectorType.get([vector_size], xrt_dtype) - identity_map = AffineMapAttr.get(AffineMap.get_identity(1)) + vecTy = vec_type(vector_size, xrt_dtype) + identity_map = identity_map_attr() # L3 types l3MemrefTy = MemRefType.get([M, N], xrt_dtype) l3WeightTy = MemRefType.get([N], xrt_dtype) # L1 types - l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1) - l1RowTy = MemRefType.get([N], xrt_dtype, memory_space=l1_mem_space) - l1VecTy = MemRefType.get([vector_size], xrt_dtype, memory_space=l1_mem_space) + l1RowTy = l1_memref_type([N], xrt_dtype) + l1VecTy = l1_memref_type([vector_size], xrt_dtype) @FuncOp.from_py_func(l3MemrefTy, l3WeightTy, l3MemrefTy) def weighted_rms_norm(arg0, arg1, arg2): @@ -161,12 +158,9 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_weight, l3_out): VECTOR_SIZE = 16 INPUT_DATATYPE = bfloat16 - parser = argparse.ArgumentParser( - prog="run.py", - description="Builds, runs, and tests the weighted RMS normalization example", + parser = make_air_parser( + "Builds, runs, and tests the weighted RMS normalization example" ) - parser.add_argument("-v", "--verbose", action="store_true") - parser.add_argument("-p", "--print-module-only", action="store_true") parser.add_argument("--M", type=int, default=M_DEFAULT, help="M dimension (rows)") parser.add_argument("--N", type=int, default=N_DEFAULT, help="N dimension (cols)") parser.add_argument( @@ -175,20 +169,6 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_weight, l3_out): default=VECTOR_SIZE, help="Vector size for SIMD operations", ) - parser.add_argument( - "--compile-mode", - type=str, - choices=["compile-only", "compile-and-run"], - dest="compile_mode", - default="compile-and-run", - ) - parser.add_argument( - "--output-format", - type=str, - choices=["xclbin", "elf"], - default="xclbin", - dest="output_format", - ) args = parser.parse_args() mlir_module = build_module(args.M, args.N, INPUT_DATATYPE, args.vector_size) @@ -209,30 +189,14 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_weight, l3_out): (x_input.astype(np.float32) / rms) * weight.astype(np.float32) ).astype(INPUT_DATATYPE) - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, + exit( + run_on_npu( + args, + mlir_module, + inputs=[x_input, weight], instance_name="weighted_rms_norm", - runtime_loop_tiling_sizes=[4, 4], + expected_outputs=[y_expected], + rtol=5e-2, + atol=5e-1, ) - exit( - runner.run_test( - mlir_module, - inputs=[x_input, weight], - expected_outputs=[y_expected], - rtol=5e-2, - atol=5e-1, - ) - ) - - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - backend.unload() + ) diff --git a/python/air/backend/xrt.py b/python/air/backend/xrt.py index 259205361..9b3cbe68f 100644 --- a/python/air/backend/xrt.py +++ b/python/air/backend/xrt.py @@ -3,15 +3,33 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT +""" +XRT backend for mlir-air. + +Public API +---------- +compile_air(air_module, ...) -> NPUKernel + Compile an AIR dialect MLIR module to an NPU kernel artifact. + +AirRuntime (CachedXRTRuntime subclass) + Richer verify_results() with rtol/atol, stochastic sampling, + mismatch budget, and Pearson correlation. + +get_air_runtime() -> AirRuntime + Return the process-level singleton AirRuntime. + +XRTTensor (re-exported from aie.utils) + Numpy-backed buffer object for XRT. +""" + import air.ir import air.passmanager -from .abc import AirBackend, AirBackendError +from .abc import AirBackendError import air.compiler.util # Register the AIR dialect so air.ir.Context() can parse AIR ops. -# This was previously done as a side effect of importing aircc.main. from air.dialects import air as _air_dialect # noqa: F401 import numpy as np @@ -21,31 +39,653 @@ from ml_dtypes import bfloat16 +# --------------------------------------------------------------------------- +# mlir-aie runtime imports +# --------------------------------------------------------------------------- +try: + import aie.utils as _aie_utils + from aie.utils import CachedXRTRuntime, NPUKernel + from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor + + _HAS_AIE_RUNTIME = True + # Factory function that selects XRTTensor or CPUOnlyTensor depending on + # whether pyxrt is importable. Used throughout instead of XRTTensor() directly. + _tensor = _aie_utils.tensor +except ImportError: + _HAS_AIE_RUNTIME = False + CachedXRTRuntime = object # fallback base so class definition succeeds + NPUKernel = None + XRTTensor = None + _tensor = None + + +# --------------------------------------------------------------------------- +# compile_air() — replaces XRTBackend.compile() +# --------------------------------------------------------------------------- + + +def compile_air( + air_module: air.ir.Module, + *, + verbose: bool = False, + target_device: str = None, + omit_while_true_loop: bool = False, + omit_pingpong: str = "", + lower_linalg_to_func=None, + air_loop_fusion: bool = False, + runtime_loop_tiling_sizes=None, + omit_auto_broadcast: bool = False, + channel_multiplexing=None, + use_lock_race_condition_fix: bool = False, + trace_offset: int = 0, + trace_size: int = 0, + output_format: str = "xclbin", + xclbin_kernel_name: str = "", + instance_name: str = "", + kernel_id: str = "", + xclbin_input: str = "", + num_device_cols: int = 0, + debug_ir: bool = False, + bf16_emulation: bool = False, + # Legacy aliases kept for backward compat + kernel_name: str = "", + output_binary_name: str = "air", + insts: str = "air.insts.bin", +): + """Compile an AIR dialect MLIR module to an NPUKernel artifact. + + Replaces ``XRTBackend(...).compile(air_module)``. + + Args: + air_module: The MLIR module in AIR dialect. + verbose: Verbose output. + target_device: Explicit target device ("npu1", "npu2", etc.). + If None, auto-detect via xrt-smi. + omit_while_true_loop: Omit the while-true loop in generated code. + omit_pingpong: Omit ping-pong buffering for given memory level. + Values: "", "L1", "L2", "all". + lower_linalg_to_func: Lower linalg.generic to function calls. + air_loop_fusion: Enable air-loop-fusion pass. + runtime_loop_tiling_sizes: Extra runtime loop tiling sizes. + omit_auto_broadcast: Omit automatic broadcast detection. + channel_multiplexing: Air channel multiplexing memory spaces. + use_lock_race_condition_fix: Enable lock race condition fix. + trace_offset: Trace output offset (bytes). + trace_size: Trace output size (bytes). + output_format: Output binary format: "xclbin", "elf", or "txn". + xclbin_kernel_name: Kernel name embedded in xclbin metadata. + instance_name: Instance name embedded in xclbin metadata. + kernel_id: Kernel ID embedded in xclbin file. + xclbin_input: Existing xclbin to embed the new kernel into. + num_device_cols: Device columns to constrain the design to (0=all). + debug_ir: Save IR after each pass to debug_ir/ directory. + bf16_emulation: Emulate f32 vector arithmetic with bf16. + kernel_name: Legacy alias for xclbin_kernel_name. + output_binary_name: Base name for the output binary (without extension). + insts: Instruction filename (for xclbin format). + + Returns: + NPUKernel: Compiled kernel artifact with xclbin/insts paths. + """ + if runtime_loop_tiling_sizes is None: + runtime_loop_tiling_sizes = [] + if channel_multiplexing is None: + channel_multiplexing = [] + + # Support legacy kernel_name alias + effective_kernel_name = xclbin_kernel_name or kernel_name + + # Support backward compatibility: convert bool omit_pingpong + if isinstance(omit_pingpong, bool): + omit_pingpong = "all" if omit_pingpong else "" + + # Determine target device + if target_device is not None: + if verbose: + print(f"Using explicitly specified target device: {target_device}") + else: + target_device = "npu1" # default fallback + try: + import re -class XRTCompileArtifact: - """A class encompassing information on the artifacts produced by compilation for the NPU/XRT""" + xrtsmi = "/opt/xilinx/xrt/bin/xrt-smi" + result = subprocess.run( + [xrtsmi, "examine"], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + result_lines = result.stdout.decode("utf-8").split("\n") + p = re.compile(r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU (\w+))\W*\|") + for line in result_lines: + m = p.match(line) + if not m: + continue + if verbose: + print("Found Ryzen AI device:", m.group(1)) + model = "unknown" + if m.group(3): + model = str(m.group(3)) + if m.group(4): + model = str(m.group(4)) + if verbose: + print(f"\tmodel: '{model}'") + if model in ["npu1", "Phoenix"]: + target_device = "npu1" + elif model in ["npu4", "Strix"]: + target_device = "npu2" + else: + print(f"WARNING: xrt-smi reported unknown NPU model '{model}'.") + break + except Exception as e: + if verbose: + print("Failed to run xrt-smi, using default target device") + print(e) + + # Validate ELF format compatibility + if output_format == "elf" and "npu1" in target_device: + raise AirBackendError( + f"output_format='elf' is not supported for {target_device} target. " + "ELF output format is only supported on npu2 and later devices." + ) + + # Apply column configuration + if num_device_cols > 0: + max_cols = 4 if target_device == "npu1" else 8 + if num_device_cols > max_cols - 1: + raise AirBackendError( + f"Invalid num_device_cols value: {num_device_cols}. " + f"For {target_device}, valid values are 0 (entire device) or 1-{max_cols-1}" + ) + base_device = target_device + target_device = f"{target_device}_{num_device_cols}col" + if verbose: + print( + f"Confining design to {num_device_cols} column(s) of {base_device} device: {target_device}" + ) - def __init__( + # Determine peano toolchain + peano_package_dir = os.environ.get("PEANO_INSTALL_DIR", "") + if peano_package_dir and os.path.isdir(peano_package_dir): + print( + "compile_air: llvm-aie package detected via PEANO_INSTALL_DIR:", + peano_package_dir, + ) + + # Determine output binary file name + if output_format == "elf": + output_binary = f"{output_binary_name}.elf" + elif output_format == "txn": + output_binary = f"{output_binary_name}.txn" + else: # xclbin (default) + output_binary = f"{output_binary_name}.xclbin" + + with air.ir.Context(): + if verbose: + print("AIR Module:") + print(air_module) + + aircc_options = [ + "--device", + target_device, + "air.mlir", + ] + + # Output file options + if output_format == "elf": + aircc_options += ["--elf-name", output_binary] + else: + aircc_options += ["-o", output_binary] + aircc_options += ["-i", insts] + + for s in runtime_loop_tiling_sizes: + aircc_options += [f"--air-runtime-loop-tiling-sizes={s}"] + + if verbose: + aircc_options = aircc_options + ["-v"] + + if omit_while_true_loop: + aircc_options += ["--omit-while-true-loop"] + + if omit_pingpong: + pp_val = "all" if omit_pingpong is True else str(omit_pingpong) + aircc_options += [f"--omit-ping-pong-transform={pp_val}"] + + if lower_linalg_to_func: + aircc_options += ["--lower-linalg-to-func"] + aircc_options += [lower_linalg_to_func] + + if air_loop_fusion: + aircc_options += ["--air-loop-fusion"] + + if omit_auto_broadcast: + aircc_options += ["--omit-auto-broadcast"] + + if len(channel_multiplexing) != 0: + for ch in channel_multiplexing: + aircc_options += [f"--air-channel-multiplexing={ch}"] + + if use_lock_race_condition_fix: + aircc_options += ["--use-lock-race-condition-fix"] + + if trace_size != 0: + aircc_options += ["-trace-size"] + aircc_options += [str(trace_size)] + aircc_options += ["-trace-offset"] + aircc_options += [str(trace_offset)] + + if output_format != "": + aircc_options += ["--output-format"] + aircc_options += [output_format] + if effective_kernel_name != "": + aircc_options += ["--xclbin-kernel-name"] + aircc_options += [effective_kernel_name] + if instance_name != "": + aircc_options += ["--xclbin-instance-name"] + aircc_options += [instance_name] + if kernel_id != "": + aircc_options += ["--xclbin-kernel-id"] + aircc_options += [kernel_id] + if xclbin_input != "": + aircc_options += ["--xclbin-input"] + aircc_options += [xclbin_input] + + if peano_package_dir != "": + aircc_options += ["--peano"] + aircc_options += [peano_package_dir] + aircc_options += ["--no-xchesscc"] + aircc_options += ["--no-xbridge"] + else: + aircc_options += ["--xchesscc"] + aircc_options += ["--xbridge"] + + if debug_ir: + aircc_options += ["--debug-ir"] + + if bf16_emulation: + aircc_options += ["--bf16-emulation"] + + if verbose: + print("Running aircc with options:", " ".join(aircc_options)) + + # Write module to disk for aircc + with open("air.mlir", "w") as f: + f.write(str(air_module)) + + # Invoke aircc + aircc_exe = shutil.which("aircc") + if not aircc_exe: + raise AirBackendError( + "aircc binary not found in PATH. " + "Ensure mlir-air is installed and aircc is on PATH." + ) + result = subprocess.run( + [aircc_exe] + aircc_options, + capture_output=True, + text=True, + ) + if result.returncode != 0: + error_msg = result.stderr if result.stderr else result.stdout + raise AirBackendError(f"aircc compilation failed:\n{error_msg}") + + # Build kernel_name for NPUKernel + if output_format == "elf" and instance_name != "": + npu_kernel_name = f"main:{instance_name}" + else: + npu_kernel_name = effective_kernel_name if effective_kernel_name else "MLIR_AIE" + + if _HAS_AIE_RUNTIME: + return NPUKernel(output_binary, insts, kernel_name=npu_kernel_name) + else: + # Fallback: return a simple namespace when aie.utils is unavailable + import types + + kernel = types.SimpleNamespace( + xclbin_path=output_binary, + insts_path=insts, + kernel_name=npu_kernel_name, + ) + return kernel + + +# --------------------------------------------------------------------------- +# AirRuntime — CachedXRTRuntime with mlir-air's richer verification +# --------------------------------------------------------------------------- + + +class AirRuntime(CachedXRTRuntime): + """ + mlir-aie's CachedXRTRuntime extended with mlir-air's richer verification. + + Inherits: device open, xclbin/ELF caching (32 contexts NPU2), run(). + Overrides: verify_results() with rtol/atol/stochastic/correlation checks. + Adds: run_test() convenience method that compiles, runs, and verifies. + """ + + def run_test( self, - output_binary, - kernel, - insts, - ): + npu_kernel, + io_args, + refs=None, + rtol: float = 1e-3, + atol: float = 1e-8, + max_mismatch_percentage: float = 0.0, + min_correlation=None, + stochastic_refs=None, + verbosity: int = 0, + trace_file: str = "trace_data.txt", + ) -> int: """ - Constructor for an XRTCompileArtifact + Load, run, and verify an NPU kernel. Args: - output_binary: output binary file name/path (.xclbin, .elf, or .txn) - kernel: kernel name - insts: instruction file name/path + npu_kernel: NPUKernel from compile_air(). + io_args: List of XRTTensor objects (inputs + outputs). + refs: dict mapping output index → expected numpy array (dense). + rtol: Relative tolerance for floating-point checks. + atol: Absolute tolerance for floating-point checks. + max_mismatch_percentage: Max % of elements allowed to mismatch. + min_correlation: Minimum Pearson correlation (None = disabled). + stochastic_refs: List of {"shape", "indices", "values"} dicts. + verbosity: Verbosity level. + trace_file: Filename to save trace data (if trace_size > 0). + + Returns: + 0 on pass, -1 on failure. """ + import filelock + + handle = self.load(npu_kernel) + with filelock.FileLock("/tmp/npu.lock"): + self.run(handle, io_args) + + errors = self.verify_results( + io_args, + refs=refs or {}, + rtol=rtol, + atol=atol, + max_mismatch_percentage=max_mismatch_percentage, + min_correlation=min_correlation, + stochastic_refs=stochastic_refs, + verbosity=verbosity, + ) + if errors == 0: + print("PASS!") + return 0 + else: + print("failed.") + return -1 + + @classmethod + def verify_results( + cls, + io_args, + refs=None, + rtol: float = 1e-3, + atol: float = 1e-8, + max_mismatch_percentage: float = 0.0, + min_correlation=None, + stochastic_refs=None, + verbosity: int = 0, + ) -> int: + """ + Verify kernel outputs against reference data. + + Args: + io_args: List of XRTTensor (or numpy array) outputs. + refs: dict {index: expected_np_array} for dense checks. + rtol: Relative tolerance. + atol: Absolute tolerance. + max_mismatch_percentage: Max % of mismatches tolerated (0–100). + min_correlation: Minimum Pearson correlation (None = disabled). + stochastic_refs: List of {"shape","indices","values"} dicts. + verbosity: Verbosity level. + + Returns: + Number of errors found (0 = pass). + """ + if refs is None: + refs = {} + + errors = 0 + np.set_printoptions(formatter={"int": hex}) + + for idx, expected in refs.items(): + raw = io_args[idx] + actual = raw.numpy() if hasattr(raw, "numpy") else np.asarray(raw) + actual = np.reshape(actual, expected.shape) + + if verbosity >= 1: + print(f"Expected output [{idx}]:", expected) + print(f"Actual output [{idx}]:", actual) + + errors += _check_dense( + actual, + expected, + rtol=rtol, + atol=atol, + idx=idx, + max_mismatch_percentage=max_mismatch_percentage, + min_correlation=min_correlation, + ) + + if stochastic_refs: + num_dense = len(refs) + for i, sref in enumerate(stochastic_refs): + raw = io_args[num_dense + i] + actual = raw.numpy() if hasattr(raw, "numpy") else np.asarray(raw) + actual = np.reshape(actual, sref["shape"]) + + if verbosity >= 1: + print(f"Stochastic expected [{i}]: shape={sref['shape']}") + print(f"Stochastic actual [{i}]:", actual) + + errors += _check_stochastic( + actual, + sref, + rtol=rtol, + atol=atol, + idx=i, + max_mismatch_percentage=max_mismatch_percentage, + ) + + return errors + + +# --------------------------------------------------------------------------- +# Module-level singleton +# --------------------------------------------------------------------------- + +_air_runtime = None + + +def get_air_runtime() -> AirRuntime: + """Return the process-level AirRuntime singleton.""" + global _air_runtime + if _air_runtime is None: + if not _HAS_AIE_RUNTIME: + raise AirBackendError( + "aie.utils (mlir-aie) is not available. " + "Install mlir-aie to use AirRuntime." + ) + _air_runtime = AirRuntime() + return _air_runtime + + +# --------------------------------------------------------------------------- +# Internal helpers for verification +# --------------------------------------------------------------------------- + + +def _check_dense( + actual, expected, rtol, atol, idx, max_mismatch_percentage, min_correlation +): + """Dense element-wise check. Returns number of errors (0 or 1).""" + total_elements = expected.size + + if expected.dtype in [np.float16, np.float32, np.float64, bfloat16]: + if expected.dtype == bfloat16: + expected = expected.astype(np.float64) + actual = actual.astype(np.float64) + + # Element-wise tolerance check + close_mask = np.isclose(actual, expected, rtol=rtol, atol=atol) + mismatch_indices = np.where(~close_mask) + num_mismatches = len(mismatch_indices[0]) + max_acceptable = int(total_elements * max_mismatch_percentage / 100) + + elementwise_ok = num_mismatches <= max_acceptable + if not elementwise_ok: + print(f"ERROR: Output {idx} does not meet expected output.") + print(f"Shape: {expected.shape}") + if total_elements > 0: + print( + f"Mismatches: {num_mismatches} / {total_elements} elements " + f"({100*num_mismatches/total_elements:.2f}%)" + ) + if max_acceptable > 0: + print(f"Max acceptable: {max_acceptable} ({max_mismatch_percentage}%)") + _print_mismatches_dense(actual, expected, mismatch_indices, num_mismatches) + + # Correlation check + corr_ok = True + if min_correlation is not None and total_elements > 0: + corr = float(np.corrcoef(actual.flatten(), expected.flatten())[0, 1]) + print( + f"Output {idx} correlation: {corr:.6f} (threshold: {min_correlation})" + ) + if not np.isfinite(corr) or corr < min_correlation: + corr_ok = False + print( + f"ERROR: Output {idx} correlation {corr:.6f} below threshold {min_correlation}" + ) + + return 0 if (elementwise_ok and corr_ok) else 1 + else: + if not np.array_equal(actual, expected): + print(f"ERROR: Output {idx} does not meet expected output.") + mismatch_mask = actual != expected + mismatch_indices = np.where(mismatch_mask) + num_mismatches = len(mismatch_indices[0]) + print(f"Shape: {expected.shape}") + if total_elements > 0: + print( + f"Mismatches: {num_mismatches} / {total_elements} elements " + f"({100*num_mismatches/total_elements:.2f}%)" + ) + _print_mismatches_dense(actual, expected, mismatch_indices, num_mismatches) + return 1 + return 0 + + +def _print_mismatches_dense(actual, expected, mismatch_indices, num_mismatches): + max_display = 20 + print(f"First {min(max_display, num_mismatches)} mismatched locations:") + for j in range(min(max_display, num_mismatches)): + idx_t = tuple(dim[j] for dim in mismatch_indices) + if np.issubdtype(expected.dtype, np.floating): + print( + f" Index {idx_t}: expected={expected[idx_t]}, actual={actual[idx_t]}, " + f"diff={abs(actual[idx_t] - expected[idx_t])}" + ) + else: + print( + f" Index {idx_t}: expected={expected[idx_t]}, actual={actual[idx_t]}" + ) + if num_mismatches > max_display: + print(f" ... and {num_mismatches - max_display} more mismatches") + + +def _check_stochastic(actual, sref, rtol, atol, idx, max_mismatch_percentage): + """Stochastic spot-check. Returns number of errors (0 or 1).""" + if sref["values"][0].dtype in [np.float16, np.float32, np.float64, bfloat16]: + values = sref["values"] + if values[0].dtype == bfloat16: + values = values.astype(np.float64) + actual = actual.astype(np.float64) + actual_stochastic = actual[tuple(sref["indices"])] + close_mask = np.isclose(actual_stochastic, values, rtol=rtol, atol=atol) + mismatch_positions = np.where(~close_mask)[0] + num_mismatches = len(mismatch_positions) + total_elements = len(values) + max_acceptable = int(total_elements * max_mismatch_percentage / 100) + if num_mismatches > max_acceptable: + print(f"ERROR: Stochastic output {idx} does not meet expected output.") + print(f"Shape: {sref['shape']}") + print(f"Stochastic check: {total_elements} sampled elements") + print( + f"Mismatches: {num_mismatches} / {total_elements} elements " + f"({100*num_mismatches/total_elements:.2f}%)" + ) + if max_acceptable > 0: + print(f"Max acceptable: {max_acceptable} ({max_mismatch_percentage}%)") + max_display = 20 + print(f"First {min(max_display, num_mismatches)} mismatched locations:") + for j in range(min(max_display, num_mismatches)): + pos = mismatch_positions[j] + idx_t = tuple(dim[pos] for dim in sref["indices"]) + exp_val = values[pos] + act_val = actual_stochastic[pos] + print( + f" Index {idx_t}: expected={exp_val}, actual={act_val}, " + f"diff={abs(act_val - exp_val)}" + ) + if num_mismatches > max_display: + print(f" ... and {num_mismatches - max_display} more mismatches") + return 1 + return 0 + else: + actual_stochastic = actual[tuple(sref["indices"])] + if not np.array_equal(actual_stochastic, sref["values"]): + print(f"ERROR: Stochastic output {idx} does not meet expected output.") + mismatch_mask = actual_stochastic != sref["values"] + mismatch_positions = np.where(mismatch_mask)[0] + num_mismatches = len(mismatch_positions) + total_elements = len(sref["values"]) + print(f"Shape: {sref['shape']}") + print(f"Stochastic check: {total_elements} sampled elements") + if total_elements > 0: + print( + f"Mismatches: {num_mismatches} / {total_elements} elements " + f"({100*num_mismatches/total_elements:.2f}%)" + ) + max_display = 20 + print(f"First {min(max_display, num_mismatches)} mismatched locations:") + for j in range(min(max_display, num_mismatches)): + pos = mismatch_positions[j] + idx_t = tuple(dim[pos] for dim in sref["indices"]) + exp_val = sref["values"][pos] + act_val = actual_stochastic[pos] + print(f" Index {idx_t}: expected={exp_val}, actual={act_val}") + if num_mismatches > max_display: + print(f" ... and {num_mismatches - max_display} more mismatches") + return 1 + return 0 + + +# --------------------------------------------------------------------------- +# Backward compatibility shims +# --------------------------------------------------------------------------- + + +class XRTCompileArtifact: + """ + Deprecated. Use NPUKernel from compile_air() instead. + + This shim wraps NPUKernel so existing code that unpacks + .output_binary / .kernel / .insts still works. + """ + + def __init__(self, output_binary, kernel, insts): self.output_binary = output_binary self.kernel = kernel self.insts = insts -class XRTBackend(AirBackend): - """Main entry-point for the xrt based AIR backend.""" +class XRTBackend: + """ + Deprecated. Use compile_air() + get_air_runtime() instead. + + This shim delegates to compile_air() and get_air_runtime() so + existing code continues to work without modification. + """ def __init__( self, @@ -53,11 +693,11 @@ def __init__( target_device: str = None, omit_while_true_loop: bool = False, omit_pingpong: str = "", - lower_linalg_to_func: str = None, + lower_linalg_to_func=None, air_loop_fusion: bool = False, - runtime_loop_tiling_sizes: list[int] = [], + runtime_loop_tiling_sizes=None, omit_auto_broadcast: bool = False, - channel_multiplexing: list[str] = [], + channel_multiplexing=None, use_lock_race_condition_fix: bool = False, trace_offset: int = 0, trace_size: int = 0, @@ -70,51 +710,21 @@ def __init__( debug_ir: bool = False, bf16_emulation: bool = False, ): - """Constructor for XRTBackend - - Args: - verbose: verbose output - target_device: specify target device explicitly ("npu1", "npu2", etc.). If None, will attempt auto-detection via xrt-smi. This parameter is useful when compiling without XRT installed. - omit_while_true_loop: configure aircc to omit the while true loop it traditionally emits. - omit_pingpong: configure aircc to omit the generation of ping-pong buffering for specific memory levels. Supported values: "", "L1", "L2", "all". Empty string means no omission (default). - lower_linalg_to_func: configure aircc to lower linalg.generic to function calls, or loops. - air_loop_fusion: configure aircc to add air-loop-fusion experimental pass. - runtime_loop_tiling_sizes: configure aircc to add extra runtime loop tiling using the experimental affine-loop-opt pass. - omit_auto_broadcast: configure aircc to omit the detection and lowering of broadcast data movements. - channel_multiplexing: configure aircc to perform air channel multiplexing on specified memroy spaces. - use_lock_race_condition_fix: configure aircc to enable a fix for lock race condition which protects against race condition. - trace_offset: configure aircc to stream out profiling traces at outputs, starting from the specified offset. - trace_size: configure aircc to stream out profiling traces at outputs, with specified trace data size. - output_format: configure aircc to produce output binary in to one of the following formats: [xclbin, txn, elf]. - kernel_name: configure aircc to package the kernel with the specified name. - instance_name: configure aircc to package the kernel with specified instance name in xclbin metadata. - kernel_id: configure aircc to package the kernel with specified kernel id in xclbin file. - xclbin_input: configure aircc to package the kernel into an existing xclbin with specified xclbin file name. - num_device_cols: number of device columns to confine the design within (0 means entire device, default). - For npu1 (4 columns total): valid values are 0 (entire device), 1, 2, 3 - For npu2 (8 columns total): valid values are 0 (entire device), 1, 2, 3, 4, 5, 6, 7 - debug_ir: enable debug mode to emit IR after each individual pass for fine-grained inspection. - IRs are saved to /debug_ir/ with sequence numbers. - bf16_emulation: emulate f32 vector arithmetic using bf16 operations. - """ - super().__init__() self.verbose = verbose self.target_device = target_device - self.omit_while_true_loop = omit_while_true_loop - # Support backward compatibility: convert True to "all", False to "" if isinstance(omit_pingpong, bool): self.omit_pingpong = "all" if omit_pingpong else "" else: self.omit_pingpong = omit_pingpong + self.omit_while_true_loop = omit_while_true_loop self.lower_linalg_to_func = lower_linalg_to_func self.air_loop_fusion = air_loop_fusion - self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes + self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes or [] self.omit_auto_broadcast = omit_auto_broadcast - self.channel_multiplexing = channel_multiplexing + self.channel_multiplexing = channel_multiplexing or [] self.use_lock_race_condition_fix = use_lock_race_condition_fix self.trace_offset = trace_offset self.trace_size = trace_size - self.currently_loaded = False self.output_format = output_format self.kernel_name = kernel_name self.instance_name = instance_name @@ -123,6 +733,19 @@ def __init__( self.num_device_cols = num_device_cols self.debug_ir = debug_ir self.bf16_emulation = bf16_emulation + # Legacy attributes referenced by some callers + self._npu_kernel = None + self._handle = None + self._runtime = None + self.currently_loaded = False + # These were set as side-effects of load() + self.xclbin = None + self.elf = None + self.device = None + self.context = None + self.kernel = None + self.bo_instr = None + self.instr_v = None def __del__(self): self.unload() @@ -134,467 +757,86 @@ def compile( kernel="MLIR_AIE", insts="air.insts.bin", ): - """Compiles an AIR module for the NPU / XRT Runtime with aircc. - - The module is expected to be AIR dialect IR. The input IR is passed directly to aircc. - - Args: - air_module: The MLIR module consisting of funcs in the AIR dialect. - output_binary_name: base name for the output binary (without extension). - Extension is determined by output_format: .xclbin, .elf, or .txn - kernel: kernel name to use - insts: instruction filename to use - Returns: - An XRTCompileArtifact object - """ + """Compile an AIR module. Returns XRTCompileArtifact for compat.""" + npu_kernel = compile_air( + air_module, + verbose=self.verbose, + target_device=self.target_device, + omit_while_true_loop=self.omit_while_true_loop, + omit_pingpong=self.omit_pingpong, + lower_linalg_to_func=self.lower_linalg_to_func, + air_loop_fusion=self.air_loop_fusion, + runtime_loop_tiling_sizes=self.runtime_loop_tiling_sizes, + omit_auto_broadcast=self.omit_auto_broadcast, + channel_multiplexing=self.channel_multiplexing, + use_lock_race_condition_fix=self.use_lock_race_condition_fix, + trace_offset=self.trace_offset, + trace_size=self.trace_size, + output_format=self.output_format, + kernel_name=self.kernel_name, + instance_name=self.instance_name, + kernel_id=self.kernel_id, + xclbin_input=self.xclbin_input, + num_device_cols=self.num_device_cols, + debug_ir=self.debug_ir, + bf16_emulation=self.bf16_emulation, + output_binary_name=output_binary_name, + insts=insts, + ) + self._npu_kernel = npu_kernel + # Build a compat artifact + xclbin_path = getattr(npu_kernel, "xclbin_path", output_binary_name) + kernel_name = getattr(npu_kernel, "kernel_name", kernel) + insts_path = getattr(npu_kernel, "insts_path", insts) + return XRTCompileArtifact(xclbin_path, kernel_name, insts_path) + + def load(self, artifact): + """Load a compiled artifact. Returns an invoker callable.""" if self.currently_loaded: raise AirBackendError( - "Cannot use XRTBackend to compile while the artifact is currently loaded. Call unload() first." + "Cannot load while an artifact is currently loaded. Call unload() first." ) - - # Determine target device: use explicit parameter if provided, otherwise auto-detect - if self.target_device is not None: - target_device = self.target_device - if self.verbose: - print(f"Using explicitly specified target device: {target_device}") - else: - # Try to auto-detect device via xrt-smi - target_device = "npu1" # Default fallback - try: - import re - - xrtsmi = "/opt/xilinx/xrt/bin/xrt-smi" - result = subprocess.run( - [xrtsmi, "examine"], stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - result = result.stdout.decode("utf-8").split("\n") - # Older format is "|[0000:41:00.1] ||RyzenAI-npu1 |" - # Newer format is "|[0000:41:00.1] |NPU Phoenix |" - p = re.compile( - r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU (\w+))\W*\|" + if self._npu_kernel is None: + # Reconstruct NPUKernel from the artifact for the case where + # compile() was called separately. + if _HAS_AIE_RUNTIME: + self._npu_kernel = NPUKernel( + artifact.output_binary, + artifact.insts, + kernel_name=artifact.kernel, ) - for l in result: - m = p.match(l) - if not m: - continue - if self.verbose: - print("Found Ryzen AI device:", m.group(1)) - model = "unknown" - if m.group(3): - model = str(m.group(3)) - if m.group(4): - model = str(m.group(4)) - if self.verbose: - print(f"\tmodel: '{model}'") - if model in ["npu1", "Phoenix"]: - target_device = "npu1" - elif model in ["npu4", "Strix"]: - target_device = "npu2" - else: - print("WARNING: xrt-smi reported unknown NPU model '{model}'.") - break - except Exception as e: - if self.verbose: - print("Failed to run xrt-smi, using default target device") - print(e) - - # Validate output_format compatibility with target device - if self.output_format == "elf" and "npu1" in target_device: - raise AirBackendError( - f"output_format='elf' is not supported for {target_device} target. " - "ELF output format is only supported on npu2 and later devices." - ) - - # Apply user-specified device column configuration if provided - if self.num_device_cols > 0: - # Validate column count based on detected device - max_cols = 4 if target_device == "npu1" else 8 - if self.num_device_cols > max_cols - 1: - raise AirBackendError( - f"Invalid num_device_cols value: {self.num_device_cols}. " - f"For {target_device}, valid values are 0 (entire device) or 1-{max_cols-1}" - ) - base_device = target_device - target_device = f"{target_device}_{self.num_device_cols}col" - if self.verbose: - print( - f"Confining design to {self.num_device_cols} column(s) of {base_device} device: {target_device}" - ) - - import os, site, glob - - # Try to get peano package dir from environment variable, fallback to site-packages - peano_package_dir = os.environ.get("PEANO_INSTALL_DIR", "") - - if peano_package_dir and os.path.isdir(peano_package_dir): - print( - "XRTBackend: llvm-aie package detected via PEANO_INSTALL_DIR:", - peano_package_dir, - ) - - # Determine output file extension based on output_format - if self.output_format == "elf": - output_binary = f"{output_binary_name}.elf" - elif self.output_format == "txn": - output_binary = f"{output_binary_name}.txn" - else: # xclbin (default) - output_binary = f"{output_binary_name}.xclbin" - - with air.ir.Context(): - - if self.verbose: - print("AIR Module:") - print(air_module) - - aircc_options = [ - "--device", - target_device, - "air.mlir", - ] - - # Add output file options based on format - if self.output_format == "elf": - aircc_options += ["--elf-name", output_binary] - # Note: ELF mode features (main device wrapper, load_pdi) are - # automatically enabled by --output-format=elf in aircc - else: - aircc_options += ["-o", output_binary] - aircc_options += ["-i", insts] - - for s in self.runtime_loop_tiling_sizes: - aircc_options += [f"--air-runtime-loop-tiling-sizes={s}"] - - if self.verbose: - aircc_options = aircc_options + ["-v"] - - if self.omit_while_true_loop: - aircc_options += ["--omit-while-true-loop"] - - if self.omit_pingpong: - # Handle both bool (True -> "all") and string ("L1", "L2", "all") - pp_val = ( - "all" if self.omit_pingpong is True else str(self.omit_pingpong) - ) - aircc_options += [f"--omit-ping-pong-transform={pp_val}"] - - if self.lower_linalg_to_func: - aircc_options += ["--lower-linalg-to-func"] - aircc_options += [self.lower_linalg_to_func] - - if self.air_loop_fusion: - aircc_options += ["--air-loop-fusion"] - - if self.omit_auto_broadcast: - aircc_options += ["--omit-auto-broadcast"] - - if len(self.channel_multiplexing) != 0: - for ch in self.channel_multiplexing: - aircc_options += [f"--air-channel-multiplexing={ch}"] - - if self.use_lock_race_condition_fix: - aircc_options += ["--use-lock-race-condition-fix"] - - if self.trace_size != 0: - aircc_options += ["-trace-size"] - aircc_options += [str(self.trace_size)] - aircc_options += ["-trace-offset"] - aircc_options += [str(self.trace_offset)] - - if self.output_format != "": - aircc_options += ["--output-format"] - aircc_options += [self.output_format] - if self.kernel_name != "": - aircc_options += ["--xclbin-kernel-name"] - aircc_options += [self.kernel_name] - if self.instance_name != "": - aircc_options += ["--xclbin-instance-name"] - aircc_options += [self.instance_name] - if self.kernel_id != "": - aircc_options += ["--xclbin-kernel-id"] - aircc_options += [self.kernel_id] - if self.xclbin_input != "": - aircc_options += ["--xclbin-input"] - aircc_options += [self.xclbin_input] - if peano_package_dir != "": - aircc_options += ["--peano"] - aircc_options += [peano_package_dir] - aircc_options += ["--no-xchesscc"] - aircc_options += ["--no-xbridge"] - else: - aircc_options += ["--xchesscc"] - aircc_options += ["--xbridge"] - - if self.debug_ir: - aircc_options += ["--debug-ir"] - - if self.bf16_emulation: - aircc_options += ["--bf16-emulation"] - - if self.verbose: - print("Running aircc with options:", " ".join(aircc_options)) - - # Write the in-memory module to the input file expected by aircc - with open("air.mlir", "w") as f: - f.write(str(air_module)) - - # Invoke the C++ aircc binary - aircc_exe = shutil.which("aircc") - if not aircc_exe: - raise AirBackendError( - "aircc binary not found in PATH. " - "Ensure mlir-air is installed and aircc is on PATH." - ) - result = subprocess.run( - [aircc_exe] + aircc_options, - capture_output=True, - text=True, - ) - if result.returncode != 0: - error_msg = result.stderr if result.stderr else result.stdout - raise AirBackendError(f"aircc compilation failed:\n{error_msg}") - - # For ELF mode, the kernel identifier is "main:instance_name" - # This is used when loading the ELF via xrt.ext.kernel() - if self.output_format == "elf" and self.instance_name != "": - elf_kernel = f"main:{self.instance_name}" - else: - elf_kernel = kernel - - return XRTCompileArtifact(output_binary, elf_kernel, insts) - - def compile_from_torch_mlir( - self, - imported_module, - pipeline=None, - verbose=False, - ): - import torch_mlir - import torch_mlir.passmanager - - if type(imported_module) is torch_mlir.ir.Module: - with imported_module.operation.context: - pm = torch_mlir.passmanager.PassManager.parse( - "builtin.module(refback-mlprogram-bufferize)" - ) - pm.run(imported_module.operation) - - with air.ir.Context(): - linalg_module = air.ir.Module.parse(str(imported_module)) - pm = air.passmanager.PassManager.parse( - air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE - ) - if verbose: - print( - "Running MLIR pass pipeline: ", - air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE, - ) - pm.run(linalg_module.operation) - - if verbose: - print("Linalg Module:") - print(linalg_module) - - DEFAULT_PIPELINE = ( - "builtin.module(" - + ",".join( - [ - "buffer-results-to-out-params", - "air-linalg-codegen", - "air-par-to-herd{depth=-1}", - "air-par-to-launch{has-air-segment=true}", - "air-copy-to-dma", - "canonicalize", - "cse", - ] - ) - + ")" - ) - if pipeline is None: - pipeline = DEFAULT_PIPELINE - - if callable(pipeline): - air_module = pipeline(linalg_module) - else: - pm = air.passmanager.PassManager.parse(pipeline) - pm.run(linalg_module.operation) - air_module = linalg_module - - if verbose: - print("Air Module:") - print(air_module) - - return self.compile(air_module) - - def load(self, artifact: XRTCompileArtifact): - """Load a compiled artifact into the air runtime. - - Args: - artifact: The result of calling compile with XRTBackend on an MLIR-AIR module. - Supports both xclbin and ELF formats. - - Returns: A callable that can be used to invoke the loaded module. - The callable takes a list of numpy arrays. Each numpy array is - assumed to be an input/output tensor. The callable also returns a - list of numpy arrays, one for each tensor. - """ - # Try to import pyxrt - it's only needed for load(), not compile() - try: - import pyxrt as xrt - except ImportError: - raise AirBackendError( - "XRT runtime (pyxrt) is not available. " - "The compile() method can generate artifacts without XRT, " - "but load() requires XRT to be installed for hardware execution. " - "To compile without XRT, use compile() and specify target_device parameter. " - "Install XRT to use load() for hardware execution." - ) - - if self.currently_loaded: - raise AirBackendError( - "Cannot use XRTBackend to compile while the artifact is currently loaded. Call unload() first." - ) + self._runtime = get_air_runtime() + self._handle = self._runtime.load(self._npu_kernel) + self.currently_loaded = True - if not os.path.isfile(artifact.output_binary): - raise AirBackendError( - f"Cannot load XRTCompileArtifact because {artifact.output_binary} file does not exist" - ) + # Return a callable that mimics the old invoker interface. + # Use _tensor() factory (selects XRTTensor or CPUOnlyTensor based on + # pyxrt availability) rather than XRTTensor() directly. + runtime = self._runtime + handle = self._handle - # Determine the loading mode based on file extension - is_elf = artifact.output_binary.endswith(".elf") - - # create the device - self.device = xrt.device(0) - - if is_elf: - # ELF loading path - uses experimental APIs - # No instruction file needed for ELF (instructions embedded in ELF) - try: - self.elf = xrt.elf(artifact.output_binary) - self.context = xrt.hw_context(self.device, self.elf) - self.kernel = xrt.ext.kernel(self.context, artifact.kernel) - except Exception as e: - raise AirBackendError( - f"Failed to load ELF kernel for XRT from '{artifact.output_binary}' " - f"with kernel name '{artifact.kernel}'. " - "Ensure this file is a valid ELF binary compiled for the target device " - "and that it contains a kernel symbol matching the provided name." - ) from e - self.bo_instr = None # Not needed for ELF - self.instr_v = None - - def invoker(*args): - sizes_in_bytes = [a.size * a.itemsize for a in args] - # Use xrt.ext.bo for ELF mode (simpler, no group_id needed) - bos = [xrt.ext.bo(self.device, s) for s in sizes_in_bytes] - - for i, a in enumerate(args): - if a.dtype == bfloat16: - a = a.view(np.int16) - bos[i].write(a, 0) - bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # Use xrt.run for ELF mode - run = xrt.run(self.kernel) - for i, bo in enumerate(bos): - run.set_arg(i, bo) - run.start() - run.wait2() - - for i, a in enumerate(args): - bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - return tuple( - [ - bos[i].read(s, 0).view(args[i].dtype) - for i, s in enumerate(sizes_in_bytes) - ] - ) + def invoker(*args): + import filelock - else: - # xclbin loading path - original implementation - if not os.path.isfile(artifact.insts): - raise AirBackendError( - f"Cannot load XRTCompileArtifact because {artifact.insts} insts file does not exist" - ) + tensors = [_tensor(a) for a in args] + with filelock.FileLock("/tmp/npu.lock"): + runtime.run(handle, tensors) + return tuple(t.numpy() for t in tensors) - self.xclbin = xrt.xclbin(artifact.output_binary) - self.device.register_xclbin(self.xclbin) - self.context = xrt.hw_context(self.device, self.xclbin.get_uuid()) - - # find and load the kernel - kernels = self.xclbin.get_kernels() - try: - xkernel = [k for k in kernels if artifact.kernel in k.get_name()][0] - except: - raise AirBackendError( - f"Kernel '{artifact.kernel}' not found in '{artifact.output_binary}'" - ) - self.kernel = xrt.kernel(self.context, xkernel.get_name()) - - # load the instructions as a numpy array - with open(artifact.insts, "rb") as f: - instr_data = f.read() - self.instr_v = np.frombuffer(instr_data, dtype=np.uint32) - - self.bo_instr = xrt.bo( - self.device, - len(self.instr_v) * 4, - xrt.bo.cacheable, - self.kernel.group_id(1), - ) - self.bo_instr.write(self.instr_v, 0) - - def invoker(*args): - # limit arg length to 5 - if len(args) > 5: - raise ValueError("Too many arguments") - sizes_in_bytes = [a.size * a.itemsize for a in args] - bos = [ - xrt.bo( - self.device, s, xrt.bo.host_only, self.kernel.group_id(i + 3) - ) - for i, s in enumerate(sizes_in_bytes) - ] - - self.bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - for i, a in enumerate(args): - if a.dtype == bfloat16: - # store bfloat16 in binary as int16 - a = a.view(np.int16) - bos[i].write(a, 0) - bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - h = self.kernel(3, self.bo_instr, len(self.instr_v), *bos) - h.wait() - - for i, a in enumerate(args): - bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - return tuple( - [ - bos[i].read(s, 0).view(args[i].dtype) - for i, s in enumerate(sizes_in_bytes) - ] - ) - - self.currently_loaded = True return invoker def compile_and_load(self, module): - """ - Compile and load a module in one step. - - Args: - air_module: The MLIR module consisting of funcs in the AIR dialect. - - Returns: A callable that can be used to invoke the loaded module. - The callable takes a list of numpy arrays. Each numpy array is - assumed to be an input/output tensor. The callable also returns a - list of numpy arrays, one for each tensor. - """ + """Compile and load in one step.""" c = self.compile(module) return self.load(c) def unload(self): - """Unload any loaded module and shutdown the air runtime.""" + """Unload any loaded module.""" + self._handle = None + self._runtime = None + self._npu_kernel = None + self.currently_loaded = False + # Clear legacy attributes self.kernel = None self.context = None self.xclbin = None @@ -602,4 +844,3 @@ def unload(self): self.device = None self.bo_instr = None self.instr_v = None - self.currently_loaded = False diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py index b9fe9aae0..0a9dc5925 100644 --- a/python/air/backend/xrt_runner.py +++ b/python/air/backend/xrt_runner.py @@ -3,14 +3,57 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT -import numpy as np -from .xrt import XRTBackend -from air.dialects.air import * +""" +High-level runner helpers for mlir-air programming examples and tests. + +Public API +---------- +run_on_npu(args, mlir_module, inputs, instance_name, ...) + Compile (and optionally run) an AIR module, dispatching on + args.compile_mode. Replaces the boilerplate if/elif block found + in every example's __main__. + +make_air_parser(description, prog) + Return an ArgumentParser pre-populated with the four universal flags. + +type_mapper(np_dtype) + Map a numpy dtype to the corresponding MLIR type inside a module context. + +TYPE_MAP_DICT + The underlying defaultdict used by type_mapper. +""" + import filelock -from typing import List +import numpy as np from collections import defaultdict from ml_dtypes import bfloat16 -import timeit +from typing import List + +from air.dialects.air import * + +from .abc import AirBackendError +from .xrt import ( + compile_air, + get_air_runtime, + AirRuntime, + XRTTensor, + XRTBackend, + XRTCompileArtifact, +) + +try: + import aie.utils as _aie_utils + + _tensor = _aie_utils.tensor + _has_aie_utils = True +except ImportError: + _has_aie_utils = False + _tensor = None + + +# --------------------------------------------------------------------------- +# Type mapping helpers (unchanged — used by many callers) +# --------------------------------------------------------------------------- TYPE_MAP_DICT = defaultdict( lambda: None, @@ -35,36 +78,217 @@ def type_mapper(np_dtype): - """ - This function is meant to run within a module context (e.g., with a function wrapped with @build_module) - args: - np_dtype: the numpy data type to map - return: - The data type to run on the npu + """Map a numpy dtype to the MLIR type for use inside a module context. + + Args: + np_dtype: The numpy data type to map. + + Returns: + The corresponding MLIR type. + + Raises: + AirBackendError: If the dtype has no known mapping. """ xrt_dtype = TYPE_MAP_DICT[np_dtype]() if xrt_dtype is None: raise AirBackendError(f"numpy data type {np_dtype} has no default mapping") elif xrt_dtype.width / 8 != np.dtype(np_dtype).itemsize: - # This is a sanity check on the TYPE_MAP_DICT rather than a check on the user input raise AirBackendError( - f"Python data type has width {xrt_dtype.width / 8} but numpy data type has width {np.dtype(np_dtype).itemsize}" + f"Python data type has width {xrt_dtype.width / 8} but numpy data type " + f"has width {np.dtype(np_dtype).itemsize}" ) return xrt_dtype +# --------------------------------------------------------------------------- +# Argument parser factory (unchanged) +# --------------------------------------------------------------------------- + + +def make_air_parser(description, prog="run.py"): + """Return an ArgumentParser pre-populated with the four universal flags. + + Flags added: + -v / --verbose + -p / --print-module-only + --compile-mode {compile-only, compile-and-run} + --output-format {xclbin, elf} + + The caller adds any example-specific arguments afterwards. + """ + import argparse + + parser = argparse.ArgumentParser(prog=prog, description=description) + parser.add_argument("-v", "--verbose", action="store_true") + parser.add_argument("-p", "--print-module-only", action="store_true") + parser.add_argument( + "--compile-mode", + type=str, + choices=["compile-only", "compile-and-run"], + dest="compile_mode", + default="compile-and-run", + ) + parser.add_argument( + "--output-format", + type=str, + choices=["xclbin", "elf"], + default="xclbin", + dest="output_format", + ) + return parser + + +# --------------------------------------------------------------------------- +# check_print_module helper (kept here for backward compat) +# --------------------------------------------------------------------------- + + +def check_print_module(mlir_module, args): + """Print the MLIR module and exit if --print-module-only was passed.""" + if args.print_module_only: + print(mlir_module) + exit(0) + + +# --------------------------------------------------------------------------- +# run_on_npu() — the main dispatch helper +# --------------------------------------------------------------------------- + + +def run_on_npu( + args, + mlir_module, + inputs, + instance_name, + expected_outputs=None, + stochastic_expected_outputs=None, + rtol: float = 1e-3, + atol: float = 1e-8, + runtime_loop_tiling_sizes=None, + max_mismatch_percentage: float = 0.0, + min_correlation=None, + # Extra compile_air kwargs forwarded as-is + **compile_kwargs, +) -> int: + """Compile (and optionally run+verify) an AIR module. + + Dispatches on args.compile_mode: + - "compile-only" → compile, write artifacts, return 0 + - "compile-and-run" → compile, run on NPU, verify, return exit code + + Args: + args: Parsed argparse namespace (must have .verbose, .compile_mode, + .output_format, and optionally .print_module_only). + mlir_module: MLIR module from build_module() / Module.parse(). + inputs: List of numpy input arrays. + instance_name: xclbin instance name string. + expected_outputs: List of numpy reference arrays (dense check). + stochastic_expected_outputs: List of {"shape","indices","values"} dicts. + rtol: Relative tolerance forwarded to AirRuntime.run_test(). + atol: Absolute tolerance forwarded to AirRuntime.run_test(). + runtime_loop_tiling_sizes: Tiling sizes (default [4, 4]). + max_mismatch_percentage: Max % of elements allowed to mismatch. + min_correlation: Minimum Pearson correlation (None = disabled). + **compile_kwargs: Additional kwargs forwarded to compile_air(). + + Returns: + int: 0 = pass / compile-only success, -1 = failure. + """ + if runtime_loop_tiling_sizes is None: + runtime_loop_tiling_sizes = [4, 4] + + # --print-module-only support + if getattr(args, "print_module_only", False): + print(mlir_module) + return 0 + + npu_kernel = compile_air( + mlir_module, + verbose=args.verbose, + output_format=args.output_format, + omit_while_true_loop=False, + instance_name=instance_name, + runtime_loop_tiling_sizes=runtime_loop_tiling_sizes, + **compile_kwargs, + ) + + if args.compile_mode == "compile-only": + return 0 + + # compile-and-run + runtime = get_air_runtime() + + # Build io_args: inputs + zero-initialised output buffers. + # Use aie.utils.tensor() so the correct tensor class is picked automatically. + input_tensors = [_tensor(a) if _has_aie_utils else a for a in inputs] + output_tensors = _make_output_tensors( + expected_outputs or [], stochastic_expected_outputs or [] + ) + io_args = input_tensors + output_tensors + + # Build refs dict: map output buffer index → expected numpy array + refs = {len(inputs) + i: exp for i, exp in enumerate(expected_outputs or [])} + + return runtime.run_test( + npu_kernel, + io_args, + refs=refs, + rtol=rtol, + atol=atol, + max_mismatch_percentage=max_mismatch_percentage, + min_correlation=min_correlation, + stochastic_refs=stochastic_expected_outputs or [], + ) + + +def _make_output_tensors(expected_outputs, stochastic_expected_outputs): + """Allocate zero-filled tensors matching each expected output spec. + + Uses aie.utils.tensor() so the correct tensor class (XRTTensor when XRT + is available, CPUOnlyTensor otherwise) is selected automatically. + """ + tensors = [] + for exp in expected_outputs: + if _has_aie_utils: + tensors.append(_tensor(np.zeros(exp.shape, dtype=exp.dtype))) + else: + tensors.append(np.zeros(exp.shape, dtype=exp.dtype)) + for sref in stochastic_expected_outputs: + dtype = sref["values"].dtype if hasattr(sref["values"], "dtype") else np.float32 + shape = sref["shape"] + if isinstance(shape, int): + shape = (shape,) + if _has_aie_utils: + tensors.append(_tensor(np.zeros(shape, dtype=dtype))) + else: + tensors.append(np.zeros(shape, dtype=dtype)) + return tensors + + +# --------------------------------------------------------------------------- +# Backward compatibility shim — XRTRunner +# --------------------------------------------------------------------------- + + class XRTRunner: + """ + Deprecated. Use compile_air() + get_air_runtime() or run_on_npu() instead. + + This shim preserves the old XRTRunner interface so existing call sites + continue to work without modification during the migration period. + """ + def __init__( self, verbose: bool = False, omit_while_true_loop: bool = True, omit_pingpong: str = "", - lower_linalg_to_func: bool = False, + lower_linalg_to_func=None, air_loop_fusion: bool = False, - runtime_loop_tiling_sizes: list[int] = [], + runtime_loop_tiling_sizes: list = None, omit_auto_broadcast: bool = False, - channel_multiplexing: list[str] = [], + channel_multiplexing: list = None, use_lock_race_condition_fix: bool = False, trace_offset: int = 0, trace_size: int = 0, @@ -79,45 +303,18 @@ def __init__( bf16_emulation: bool = False, target_device: str = None, ): - """ - Args: - verbose: verbose output - omit_while_true_loop: configure aircc to omit the while true loop it traditionally emits. - omit_pingpong: configure aircc to omit the generation of ping-pong buffering for specific memory levels. Supported values: "", "L1", "L2", "all". Empty string means no omission (default). - lower_linalg_to_func: configure aircc to lower linalg.generic to function calls, or loops. - air_loop_fusion: configure aircc to add air-loop-fusion experimental pass. - runtime_loop_tiling_sizes: configure aircc to add extra runtime loop tiling using the experimental affine-loop-opt pass. - omit_auto_broadcast: configure aircc to omit the detection and lowering of broadcast data movements. - channel_multiplexing: configure aircc to perform air channel multiplexing on specified memroy spaces. - use_lock_race_condition_fix: configure aircc to enable a fix for lock race condition which protects against race condition. - trace_offset: configure aircc to stream out profiling traces at outputs, starting from the specified offset. - trace_size: configure aircc to stream out profiling traces at outputs, with specified trace data size. - output_format: configure aircc to produce output binary in to one of the following formats: [xclbin, txn, elf]. - kernel_name: configure aircc to package the kernel with the specified name. - instance_name: configure aircc to package the kernel with specified instance name in xclbin metadata. - kernel_id: configure aircc to package the kernel with specified kernel id in xclbin file. - xclbin_input: configure aircc to package the kernel into an existing xclbin with specified xclbin file name. - trace_file: default filename for saving trace data. - num_device_cols: number of device columns to confine the design within (0 means entire device, default). - For npu1 (4 columns total): valid values are 0 (entire device), 1, 2, 3 - For npu2 (8 columns total): valid values are 0 (entire device), 1, 2, 3, 4, 5, 6, 7 - debug_ir: enable debug mode to emit IR after each individual pass for fine-grained inspection. - IRs are saved to /debug_ir/ with sequence numbers. - bf16_emulation: emulate f32 vector arithmetic using bf16 operations. - target_device: specify target device explicitly ("npu1", "npu2", etc.). If None, will attempt auto-detection. - """ self.verbose = verbose - self.omit_while_true_loop = omit_while_true_loop - # Support backward compatibility: convert True to "all", False to "" + # Support backward compatibility: convert bool omit_pingpong if isinstance(omit_pingpong, bool): self.omit_pingpong = "all" if omit_pingpong else "" else: self.omit_pingpong = omit_pingpong + self.omit_while_true_loop = omit_while_true_loop self.lower_linalg_to_func = lower_linalg_to_func self.air_loop_fusion = air_loop_fusion - self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes + self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes or [] self.omit_auto_broadcast = omit_auto_broadcast - self.channel_multiplexing = channel_multiplexing + self.channel_multiplexing = channel_multiplexing or [] self.use_lock_race_condition_fix = use_lock_race_condition_fix self.trace_offset = trace_offset self.trace_size = trace_size @@ -134,34 +331,45 @@ def __init__( def run_test( self, - mlir_module: np.ndarray, + mlir_module, inputs: List[np.ndarray], - expected_outputs: List[np.ndarray] = [], - stochastic_expected_outputs: List[np.ndarray] = [], + expected_outputs: List[np.ndarray] = None, + stochastic_expected_outputs: List = None, rtol: float = 1e-3, atol: float = 1e-8, max_mismatch_percentage: float = 0, - min_correlation: float = None, + min_correlation=None, trace_file: str = None, - ): - """ + ) -> int: + """Compile, run and verify an AIR module. + Args: - mlir_module: input mlir module to test. - inputs: input matrices. - expected_outputs: expected output matrices. - stochastic_expected_outputs: expected output matrices stored in sparse coordinates. Expect each matrix to be a dictionary containing "shape", "indices" and "values" fields. - rtol: relative error tolerance. - atol: absolute error tolerance. - max_mismatch_percentage: max percentage (0-100) of elements allowed to exceed tolerance (0 = all must pass, 20 = 20% can fail). - min_correlation: minimum Pearson correlation coefficient (0-1) between actual and expected outputs for floating-point data. None disables this check. - trace_file: optional override for trace data filename. If None, uses instance default. + mlir_module: MLIR module to test. + inputs: Input numpy arrays. + expected_outputs: Expected dense output arrays. + stochastic_expected_outputs: Sparse reference dicts. + rtol: Relative tolerance. + atol: Absolute tolerance. + max_mismatch_percentage: Max % of mismatches tolerated. + min_correlation: Min Pearson correlation (None = disabled). + trace_file: Override trace data filename. + + Returns: + 0 on pass, -1 on failure. """ + if expected_outputs is None: + expected_outputs = [] + if stochastic_expected_outputs is None: + stochastic_expected_outputs = [] + if self.verbose: - print("Running module: ") + print("Running module:") print(mlir_module) - backend = XRTBackend( + npu_kernel = compile_air( + mlir_module, verbose=self.verbose, + target_device=self.target_device, omit_while_true_loop=self.omit_while_true_loop, omit_pingpong=self.omit_pingpong, lower_linalg_to_func=self.lower_linalg_to_func, @@ -180,393 +388,159 @@ def run_test( num_device_cols=self.num_device_cols, debug_ir=self.debug_ir, bf16_emulation=self.bf16_emulation, - target_device=self.target_device, ) - # Use per-test trace file if provided, otherwise use instance default - active_trace_file = trace_file if trace_file is not None else self.trace_file - - # run the module - slots are input/output for now, assume non-overlapping inputs/outputs - # Handle different scenarios for trace data + # Handle trace mode separately (uses legacy raw-numpy path for now) if self.trace_size > 0: - if expected_outputs: - # Case 1: Both outputs and trace - # Add trace_size bytes to first output - total_bytes = expected_outputs[0].nbytes + self.trace_size - first_output_with_trace = np.zeros(total_bytes, dtype=np.uint8) - remaining_outputs = [ - np.zeros(o.shape, o.dtype) for o in expected_outputs[1:] - ] - output_placeholders = [first_output_with_trace] + remaining_outputs - if self.verbose: - print( - f"Allocated {total_bytes} bytes for first output + {self.trace_size} bytes for trace data" - ) - # Record the expected_outputs[0]'s shape and dtype, to be used to split actual outputs from trace. - expected_outputs_0_shape = expected_outputs[0].shape - expected_outputs_0_dtype = expected_outputs[0].dtype - elif stochastic_expected_outputs: - # Case 2: Stochastic outputs and trace - first_output_elements = np.prod(stochastic_expected_outputs[0]["shape"]) - first_output_bytes = ( - first_output_elements - * stochastic_expected_outputs[0]["values"][0].dtype.itemsize - ) - total_bytes = first_output_bytes + self.trace_size - first_output_with_trace = np.zeros(total_bytes, dtype=np.uint8) - remaining_outputs = [ - np.zeros(o["shape"], o["values"][0].dtype) - for o in stochastic_expected_outputs[1:] - ] - output_placeholders = [first_output_with_trace] + remaining_outputs - if self.verbose: - print( - f"Allocated {first_output_bytes} bytes for first stochastic output + {self.trace_size} bytes for trace data" - ) - # Record the expected_outputs[0]'s shape and dtype, to be used to split actual outputs from trace. - expected_outputs_0_shape = stochastic_expected_outputs[0]["shape"] - expected_outputs_0_dtype = stochastic_expected_outputs[0][ - "values" - ].dtype - else: - # Case 3: Trace only, no expected outputs - trace_only_output = np.zeros(self.trace_size, dtype=np.uint8) - output_placeholders = [trace_only_output] - if self.verbose: - print( - f"Trace-only mode: allocated {self.trace_size} bytes for trace data" - ) + return self._run_with_trace( + npu_kernel, + inputs, + expected_outputs, + stochastic_expected_outputs, + rtol, + atol, + max_mismatch_percentage, + trace_file or self.trace_file, + ) + + # Standard (no-trace) path + runtime = get_air_runtime() + input_tensors = [_tensor(a) if _has_aie_utils else a for a in inputs] + output_tensors = _make_output_tensors( + expected_outputs, stochastic_expected_outputs + ) + io_args = input_tensors + output_tensors + + # Build refs dict + refs = {len(inputs) + i: exp for i, exp in enumerate(expected_outputs)} + + return runtime.run_test( + npu_kernel, + io_args, + refs=refs, + rtol=rtol, + atol=atol, + max_mismatch_percentage=max_mismatch_percentage, + min_correlation=min_correlation, + stochastic_refs=stochastic_expected_outputs, + ) + + def _run_with_trace( + self, + npu_kernel, + inputs, + expected_outputs, + stochastic_expected_outputs, + rtol, + atol, + max_mismatch_percentage, + trace_file, + ) -> int: + """Handle the trace-enabled execution path.""" + try: + from aie.utils import TraceConfig, HostRuntime + except ImportError: + raise AirBackendError( + "Trace utilities (aie.utils) are not available. " + "Install mlir-aie to use trace_size parameter." + ) + + runtime = get_air_runtime() + + # Build combined tensors for trace path + if expected_outputs: + total_bytes = expected_outputs[0].nbytes + self.trace_size + first_out = np.zeros(total_bytes, dtype=np.uint8) + rest_outs = [np.zeros(o.shape, o.dtype) for o in expected_outputs[1:]] + output_placeholders = [first_out] + rest_outs + expected_outputs_0_shape = expected_outputs[0].shape + expected_outputs_0_dtype = expected_outputs[0].dtype + elif stochastic_expected_outputs: + first_output_elements = np.prod(stochastic_expected_outputs[0]["shape"]) + first_output_bytes = ( + first_output_elements + * stochastic_expected_outputs[0]["values"][0].dtype.itemsize + ) + total_bytes = first_output_bytes + self.trace_size + first_out = np.zeros(total_bytes, dtype=np.uint8) + rest_outs = [ + np.zeros(o["shape"], o["values"][0].dtype) + for o in stochastic_expected_outputs[1:] + ] + output_placeholders = [first_out] + rest_outs + expected_outputs_0_shape = stochastic_expected_outputs[0]["shape"] + expected_outputs_0_dtype = stochastic_expected_outputs[0]["values"].dtype else: - # Case 4: No trace, original behavior - if expected_outputs: - output_placeholders = [ - np.zeros(o.shape, o.dtype) for o in expected_outputs - ] - elif stochastic_expected_outputs: - output_placeholders = [ - np.zeros(o["shape"], o["values"][0].dtype) - for o in stochastic_expected_outputs - ] - else: - assert ( - False - ), f"Expect one of 'expected_outputs' and 'stochastic_expected_outputs' to not be empty, or trace_size > 0." + trace_only_output = np.zeros(self.trace_size, dtype=np.uint8) + output_placeholders = [trace_only_output] + expected_outputs_0_shape = None + expected_outputs_0_dtype = None - expanded_inputs = inputs + output_placeholders + all_np = inputs + output_placeholders + io_args = [_tensor(a) if _has_aie_utils else a for a in all_np] - compiled_module = backend.compile(mlir_module) + handle = runtime.load(npu_kernel) with filelock.FileLock("/tmp/npu.lock"): - module_function = backend.load(compiled_module) - actual_outputs = module_function(*expanded_inputs) - - backend.unload() + runtime.run(handle, io_args) - # Remove input slots from the received outputs first - actual_outputs = list(actual_outputs[len(inputs) :]) + # Extract numpy results + actual_outputs_np = [t.numpy() for t in io_args[len(inputs) :]] - # Handle trace data extraction and saving - if self.trace_size > 0: - # Import trace utilities only when needed for trace handling - try: - from aie.utils import TraceConfig, HostRuntime - except ImportError: - raise AirBackendError( - "Trace utilities (aie.utils) are not available. " - "Trace functionality requires mlir-aie to be installed. " - "Install mlir-aie to use trace_size parameter." - ) - - actual_outputs[0], trace = HostRuntime._extract_prefix( - actual_outputs[0], + # Extract trace data + if expected_outputs_0_shape is not None: + actual_outputs_np[0], trace = HostRuntime._extract_prefix( + actual_outputs_np[0], expected_outputs_0_shape, - expected_outputs_0_dtype, + np.dtype(expected_outputs_0_dtype), ) - trace = trace.view(np.uint32).reshape(self.trace_size // 4) - trace_config = TraceConfig( - trace_size=self.trace_size, trace_file=active_trace_file - ) - trace_config.write_trace(trace) + else: + trace = actual_outputs_np[0].view(np.uint8) - print(f"Trace data ({self.trace_size} bytes) saved to {active_trace_file}") + trace = trace.view(np.uint32).reshape(self.trace_size // 4) + trace_config = TraceConfig(trace_size=self.trace_size, trace_file=trace_file) + trace_config.write_trace(trace) + print(f"Trace data ({self.trace_size} bytes) saved to {trace_file}") - # Perform result checking only if we have expected outputs - if expected_outputs and actual_outputs: - if self._check_outputs( - actual_outputs=actual_outputs, - expected_outputs=expected_outputs, + # Verify results — wrap numpy arrays as lightweight objects with .numpy() + class _NumpyWrap: + def __init__(self, arr): + self._arr = arr + + def numpy(self): + return self._arr + + wrapped = [_NumpyWrap(a) for a in actual_outputs_np] + + if expected_outputs and actual_outputs_np: + refs = {i: exp for i, exp in enumerate(expected_outputs)} + errors = AirRuntime.verify_results( + wrapped, + refs=refs, rtol=rtol, atol=atol, max_mismatch_percentage=max_mismatch_percentage, - min_correlation=min_correlation, - ): + ) + if errors == 0: print("PASS!") - return_code = 0 + return 0 else: print("failed.") - return_code = -1 - elif stochastic_expected_outputs and actual_outputs: - if self._check_outputs_stochastic( - actual_outputs=actual_outputs, - stochastic_expected_outputs=stochastic_expected_outputs, + return -1 + elif stochastic_expected_outputs and actual_outputs_np: + errors = AirRuntime.verify_results( + wrapped, + refs={}, rtol=rtol, atol=atol, max_mismatch_percentage=max_mismatch_percentage, - ): + stochastic_refs=stochastic_expected_outputs, + ) + if errors == 0: print("PASS!") - return_code = 0 + return 0 else: print("failed.") - return_code = -1 - elif self.trace_size > 0 and not ( - expected_outputs or stochastic_expected_outputs - ): - # Trace-only case - print("Trace data extracted successfully!") - return_code = 0 + return -1 else: - print("No outputs to validate.") - return_code = 0 - - return return_code - - def _check_outputs( - self, - actual_outputs: List[np.ndarray], - expected_outputs: List[np.ndarray], - rtol: float = 1e-3, - atol: float = 1e-8, - max_mismatch_percentage: float = 0, - min_correlation: float = None, - ): - assert len(actual_outputs) == len( - expected_outputs - ), f"Number of actual outputs ({len(actual_outputs)}) does not equal number of expected outputs ({len(expected_outputs)})" - np.set_printoptions(formatter={"int": hex}) - - for i, (actual, expected) in enumerate(zip(actual_outputs, expected_outputs)): - actual = np.reshape(actual, expected.shape) - - if self.verbose: - print("Expected: ") - if len(expected.shape) == 2: - print(np.asmatrix(expected)) - else: - print(expected) - print("Actual: ") - if len(actual.shape) == 2: - print(np.asmatrix(actual)) - else: - print(actual) - - if expected.dtype in [np.float16, np.float32, np.float64, bfloat16]: - if expected.dtype == bfloat16: - expected = expected.astype(np.float64) - actual = actual.astype(np.float64) - - # Element-wise tolerance check - elementwise_ok = True - close_mask = np.isclose(actual, expected, rtol=rtol, atol=atol) - mismatch_indices = np.where(~close_mask) - num_mismatches = len(mismatch_indices[0]) - total_elements = expected.size - max_acceptable = int(total_elements * max_mismatch_percentage / 100) - if num_mismatches > max_acceptable: - elementwise_ok = False - print(f"ERROR: Output {i} does not meet expected output.") - print(f"Shape: {expected.shape}") - if total_elements > 0: - print( - f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)" - ) - else: - print( - f"Mismatches: {num_mismatches} / {total_elements} elements (empty array)" - ) - if max_acceptable > 0: - print( - f"Max acceptable: {max_acceptable} ({max_mismatch_percentage}%)" - ) - # Show first N mismatches - max_display = 20 - print( - f"First {min(max_display, num_mismatches)} mismatched locations:" - ) - for j in range(min(max_display, num_mismatches)): - idx = tuple(dim[j] for dim in mismatch_indices) - print( - f" Index {idx}: expected={expected[idx]}, actual={actual[idx]}, diff={abs(actual[idx] - expected[idx])}" - ) - if num_mismatches > max_display: - print( - f" ... and {num_mismatches - max_display} more mismatches" - ) - - # Correlation check (parallel with element-wise) - corr_ok = True - if min_correlation is not None and total_elements > 0: - corr = float( - np.corrcoef(actual.flatten(), expected.flatten())[0, 1] - ) - print( - f"Output {i} correlation: {corr:.6f} " - f"(threshold: {min_correlation})" - ) - if not np.isfinite(corr) or corr < min_correlation: - corr_ok = False - print( - f"ERROR: Output {i} correlation {corr:.6f} " - f"below threshold {min_correlation}" - ) - - if not elementwise_ok or not corr_ok: - return False - else: - if not np.array_equal(actual, expected): - print(f"ERROR: Output {i} does not meet expected output.") - # Find mismatched elements - mismatch_mask = actual != expected - mismatch_indices = np.where(mismatch_mask) - num_mismatches = len(mismatch_indices[0]) - total_elements = expected.size - print(f"Shape: {expected.shape}") - if total_elements > 0: - print( - f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)" - ) - else: - print( - f"Mismatches: {num_mismatches} / {total_elements} elements (empty array)" - ) - # Show first N mismatches - max_display = 20 - print( - f"First {min(max_display, num_mismatches)} mismatched locations:" - ) - for j in range(min(max_display, num_mismatches)): - idx = tuple(dim[j] for dim in mismatch_indices) - print( - f" Index {idx}: expected={expected[idx]}, actual={actual[idx]}" - ) - if num_mismatches > max_display: - print( - f" ... and {num_mismatches - max_display} more mismatches" - ) - return False - - return True - - def _check_outputs_stochastic( - self, - actual_outputs: List[np.ndarray], - stochastic_expected_outputs: List[np.ndarray], - rtol: float = 1e-3, - atol: float = 1e-8, - max_mismatch_percentage: float = 0, - ): - assert len(actual_outputs) == len( - stochastic_expected_outputs - ), f"Number of actual outputs ({len(actual_outputs)}) does not equal number of expected outputs ({len(stochastic_expected_outputs)})" - np.set_printoptions(formatter={"int": hex}) - - for i, (actual, expected) in enumerate( - zip(actual_outputs, stochastic_expected_outputs) - ): - actual = np.reshape(actual, expected["shape"]) - - if self.verbose: - print("Expected: ") - if len(expected["shape"]) == 2: - print(np.asmatrix(expected)) - else: - print("Shape: ", expected["shape"]) - print("Indices: ", expected["indices"]) - print("Values: ", expected["values"]) - print("Actual: ") - if len(actual.shape) == 2: - print(np.asmatrix(actual)) - else: - print(actual) - - if expected["values"][0].dtype in [ - np.float16, - np.float32, - np.float64, - bfloat16, - ]: - if expected["values"][0].dtype == bfloat16: - expected["values"] = expected["values"].astype(np.float64) - actual = actual.astype(np.float64) - actual_stochastic = actual[tuple(expected["indices"])] - close_mask = np.isclose( - actual_stochastic, expected["values"], rtol=rtol, atol=atol - ) - mismatch_positions = np.where(~close_mask)[0] - num_mismatches = len(mismatch_positions) - total_elements = len(expected["values"]) - max_acceptable = int(total_elements * max_mismatch_percentage / 100) - if num_mismatches > max_acceptable: - print(f"ERROR: Output {i} does not meet expected output.") - print(f"Shape: {expected['shape']}") - print(f"Stochastic check: {total_elements} sampled elements") - print( - f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)" - ) - if max_acceptable > 0: - print( - f"Max acceptable: {max_acceptable} ({max_mismatch_percentage}%)" - ) - # Show first N mismatches - max_display = 20 - print( - f"First {min(max_display, num_mismatches)} mismatched locations:" - ) - for j in range(min(max_display, num_mismatches)): - pos = mismatch_positions[j] - idx = tuple(dim[pos] for dim in expected["indices"]) - exp_val = expected["values"][pos] - act_val = actual_stochastic[pos] - print( - f" Index {idx}: expected={exp_val}, actual={act_val}, diff={abs(act_val - exp_val)}" - ) - if num_mismatches > max_display: - print( - f" ... and {num_mismatches - max_display} more mismatches" - ) - return False - else: - actual_stochastic = actual[tuple(expected["indices"])] - if not np.array_equal(actual_stochastic, expected["values"]): - print(f"ERROR: Output {i} does not meet expected output.") - # Find mismatched elements - mismatch_mask = actual_stochastic != expected["values"] - mismatch_positions = np.where(mismatch_mask)[0] - num_mismatches = len(mismatch_positions) - total_elements = len(expected["values"]) - print(f"Shape: {expected['shape']}") - print(f"Stochastic check: {total_elements} sampled elements") - if total_elements > 0: - print( - f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)" - ) - else: - print( - f"Mismatches: {num_mismatches} / {total_elements} elements (empty array)" - ) - # Show first N mismatches - max_display = 20 - print( - f"First {min(max_display, num_mismatches)} mismatched locations:" - ) - for j in range(min(max_display, num_mismatches)): - pos = mismatch_positions[j] - idx = tuple(dim[pos] for dim in expected["indices"]) - exp_val = expected["values"][pos] - act_val = actual_stochastic[pos] - print(f" Index {idx}: expected={exp_val}, actual={act_val}") - if num_mismatches > max_display: - print( - f" ... and {num_mismatches - max_display} more mismatches" - ) - return False - - return True + print("Trace data extracted successfully!") + return 0 diff --git a/python/air/dialects/_air_ops_ext.py b/python/air/dialects/_air_ops_ext.py index 0673a7bda..c6c0dedf7 100644 --- a/python/air/dialects/_air_ops_ext.py +++ b/python/air/dialects/_air_ops_ext.py @@ -15,6 +15,8 @@ from ..extras import types as T from .func import FuncOp, CallOp +from ._air_enum_gen import MemorySpace as _MemorySpace +from .affine import apply as _affine_apply def pyint_to_index(i): @@ -320,6 +322,62 @@ def module_builder_wrapper(*args, **kwargs): segment = region_op(Segment, terminator=lambda *_args: SegmentTerminatorOp()) +def l1_memref_type(shape, element_type): + """Create a MemRef type in L1 (per-core scratchpad) memory space.""" + return MemRefType.get( + shape, element_type, + memory_space=IntegerAttr.get(T.i32(), _MemorySpace.L1), + ) + + +def l2_memref_type(shape, element_type): + """Create a MemRef type in L2 (segment-shared) memory space.""" + return MemRefType.get( + shape, element_type, + memory_space=IntegerAttr.get(T.i32(), _MemorySpace.L2), + ) + + +def vec_type(size, element_type): + """Create a 1D VectorType of given length and element type.""" + return VectorType.get([size], element_type) + + +def identity_map_attr(): + """Return a 1D identity AffineMapAttr (the standard transfer_read/write map).""" + return AffineMapAttr.get(AffineMap.get_identity(1)) + + +def tile_offset_1d(loop_var, tile_idx, tile_n): + """ + Compute the 1D strided-tile offset: loop_var + tile_idx * tile_n. + + Replaces the 12-line AffineMap.get / AffineExpr.get_add / AffineExpr.get_mul + / AffineSymbolExpr.get / AffineConstantExpr.get / affine_apply block used + in every 1D vectorized example with a 1x2 herd. + + Args: + loop_var: outer loop induction variable (SSA Value) + tile_idx: herd tile index, e.g. _ty (SSA Value) + tile_n: tile size in elements (Python int) + Returns: + SSA Value holding the computed index. + """ + offset_map = AffineMap.get( + 0, 2, + [ + AffineExpr.get_add( + AffineSymbolExpr.get(0), + AffineExpr.get_mul( + AffineSymbolExpr.get(1), + AffineConstantExpr.get(tile_n), + ), + ) + ], + ) + return _affine_apply(offset_map, [loop_var, tile_idx]) + + def external_func(name, inputs, outputs=None, visibility="private"): if outputs is None: outputs = [] diff --git a/test/xrt/01_air_to_npu/gen.py b/test/xrt/01_air_to_npu/gen.py index 5ba44c0a9..87814e76d 100644 --- a/test/xrt/01_air_to_npu/gen.py +++ b/test/xrt/01_air_to_npu/gen.py @@ -9,11 +9,12 @@ from air.compiler.util import run_transform import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.ir import * import air.passmanager +import aie.utils +import filelock import numpy as np np.random.seed(42) @@ -156,19 +157,18 @@ def forward(lhs, rhs): # Matrix B: (256, 128) B = np.random.randint(-10, 10, size=(256, 128), dtype=np.int32) C = np.matmul(A, B) -runner = XRTRunner( +npu_kernel = compile_air( + air_module, air_loop_fusion=True, omit_while_true_loop=False, use_lock_race_condition_fix=True, trace_offset=opts.trace_offset, trace_size=opts.trace_size, - trace_file=opts.trace_file, runtime_loop_tiling_sizes=[4, 4], ) -exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - ) -) +runtime = get_air_runtime() +io_args = [aie.utils.tensor(A), aie.utils.tensor(B)] + [ + aie.utils.tensor(np.zeros(C.shape, C.dtype)) +] +refs = {2: C} +exit(runtime.run_test(npu_kernel, io_args, refs=refs, trace_file=opts.trace_file)) diff --git a/test/xrt/02_mul_shim_1x1/run.py b/test/xrt/02_mul_shim_1x1/run.py index 404d6f92c..017311848 100644 --- a/test/xrt/02_mul_shim_1x1/run.py +++ b/test/xrt/02_mul_shim_1x1/run.py @@ -3,7 +3,7 @@ # RUN: %PYTHON %s | FileCheck %s -import air.backend.xrt as xrt_backend +from air.backend.xrt import compile_air, get_air_runtime from air.dialects.air import * from air.dialects.func import FuncOp import air.dialects.linalg.opdsl.lang as linalg_lang @@ -12,6 +12,7 @@ from air.ir import * import argparse +import aie.utils import numpy as np np.random.seed(42) @@ -113,18 +114,23 @@ def run_test(size, idtype, odtype): ref = (input_a * input_b).astype(odtype) input_c = np.ones_like(ref) - backend = xrt_backend.XRTBackend( + npu_kernel = compile_air( + mlir_module, verbose=verbose, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - # run the module - compiled_module = backend.compile(mlir_module) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(input_c), + ] with filelock.FileLock("/tmp/npu.lock"): - mul = backend.load(compiled_module) - _, _, output_c = mul(input_a, input_b, input_c) - backend.unload() + handle = runtime.load(npu_kernel) + runtime.run(handle, io_args) + output_c = io_args[2].numpy() print("inputA:", input_a) print("inputB:", input_b) diff --git a/test/xrt/03_mul_L1L2_1x1/run.py b/test/xrt/03_mul_L1L2_1x1/run.py index 57e3ca46b..5f4d511f4 100644 --- a/test/xrt/03_mul_L1L2_1x1/run.py +++ b/test/xrt/03_mul_L1L2_1x1/run.py @@ -3,7 +3,7 @@ # RUN: %PYTHON %s | FileCheck %s -import air.backend.xrt as xrt_backend +from air.backend.xrt import compile_air, get_air_runtime from air.dialects.air import * from air.dialects.func import FuncOp import air.dialects.linalg.opdsl.lang as linalg_lang @@ -12,6 +12,7 @@ from air.ir import * import argparse +import aie.utils import numpy as np np.random.seed(42) @@ -151,7 +152,8 @@ def run_test(size, idtype, odtype): ref = (input_a * input_b).astype(odtype) input_c = np.ones_like(ref) - backend = xrt_backend.XRTBackend( + npu_kernel = compile_air( + mlir_module, verbose=verbose, use_lock_race_condition_fix=True, output_format=args.output_format, @@ -159,12 +161,16 @@ def run_test(size, idtype, odtype): runtime_loop_tiling_sizes=[4, 4], ) - # run the module - compiled_module = backend.compile(mlir_module) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(input_c), + ] with filelock.FileLock("/tmp/npu.lock"): - mul = backend.load(compiled_module) - _, _, output_c = mul(input_a, input_b, input_c) - backend.unload() + handle = runtime.load(npu_kernel) + runtime.run(handle, io_args) + output_c = io_args[2].numpy() print("inputA:", input_a) print("inputB:", input_b) diff --git a/test/xrt/04_gemm_w_pack/gen.py b/test/xrt/04_gemm_w_pack/gen.py index 277574d2c..96b6700f9 100644 --- a/test/xrt/04_gemm_w_pack/gen.py +++ b/test/xrt/04_gemm_w_pack/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -105,8 +105,8 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/06_add_shim_bf16/gen.py b/test/xrt/06_add_shim_bf16/gen.py index c5979c3f8..7d6c6dafc 100644 --- a/test/xrt/06_add_shim_bf16/gen.py +++ b/test/xrt/06_add_shim_bf16/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * from air.passmanager import * from air.dialects.air import module_builder @@ -95,9 +95,9 @@ def mul(lhs, rhs, out): # Run compile and load ############################################### -backend = XRTBackend( +compile_air( + module, air_loop_fusion=True, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) -module_function = backend.compile(module) diff --git a/test/xrt/07_extern_linalg/gen.py b/test/xrt/07_extern_linalg/gen.py index 843c45259..5c7233965 100644 --- a/test/xrt/07_extern_linalg/gen.py +++ b/test/xrt/07_extern_linalg/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * from air.passmanager import * from air.dialects.air import module_builder @@ -100,10 +100,10 @@ def add(lhs, rhs, out): # Run compile and load ############################################### -backend = XRTBackend( +compile_air( + module, lower_linalg_to_func="kernel.o", omit_pingpong=True, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) -module_function = backend.compile(module) diff --git a/test/xrt/08_gemm_extern_vec/gen.py b/test/xrt/08_gemm_extern_vec/gen.py index b7221169f..fbbdcd7df 100644 --- a/test/xrt/08_gemm_extern_vec/gen.py +++ b/test/xrt/08_gemm_extern_vec/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -123,9 +123,9 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, air_loop_fusion=True, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/09_gemm_extern_vec_4x4/gen.py b/test/xrt/09_gemm_extern_vec_4x4/gen.py index 846c8901f..4ab683cd6 100644 --- a/test/xrt/09_gemm_extern_vec_4x4/gen.py +++ b/test/xrt/09_gemm_extern_vec_4x4/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -124,9 +124,9 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, air_loop_fusion=True, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/10_gemm_peeling_extern_vec/gen.py b/test/xrt/10_gemm_peeling_extern_vec/gen.py index efe11e766..a5b99786f 100644 --- a/test/xrt/10_gemm_peeling_extern_vec/gen.py +++ b/test/xrt/10_gemm_peeling_extern_vec/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -136,8 +136,8 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/11_gemm_bias_fusion/gen.py b/test/xrt/11_gemm_bias_fusion/gen.py index 880346c49..1bddee696 100644 --- a/test/xrt/11_gemm_bias_fusion/gen.py +++ b/test/xrt/11_gemm_bias_fusion/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -187,10 +187,12 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, air_loop_fusion=True, omit_auto_broadcast=True, - channel_multiplexing=["L1",], + channel_multiplexing=[ + "L1", + ], runtime_loop_tiling_sizes=[2, 2], ) - backend.compile(air_module) diff --git a/test/xrt/12_matmul_transform_1x4_bf16/gen.py b/test/xrt/12_matmul_transform_1x4_bf16/gen.py index 45119bdb7..1163ef67d 100644 --- a/test/xrt/12_matmul_transform_1x4_bf16/gen.py +++ b/test/xrt/12_matmul_transform_1x4_bf16/gen.py @@ -9,7 +9,7 @@ import air.passmanager from air._mlir_libs._air import run_transform from air.dialects.air import module_builder -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air import argparse @@ -132,10 +132,10 @@ def forward(lhs, rhs, out): # Run compile and load ############################################### -backend = XRTBackend( +compile_air( + air_module, air_loop_fusion=True, runtime_loop_tiling_sizes=[1, 1], lower_linalg_to_func="kernel.o", use_lock_race_condition_fix=True, ) -backend.compile(air_module) diff --git a/test/xrt/13_conv2d_i32/gen.py b/test/xrt/13_conv2d_i32/gen.py index 7ba7bf0a6..8cfa053a8 100644 --- a/test/xrt/13_conv2d_i32/gen.py +++ b/test/xrt/13_conv2d_i32/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -100,8 +100,8 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, runtime_loop_tiling_sizes=[1, 1], use_lock_race_condition_fix=True, ) - backend.compile(air_module) diff --git a/test/xrt/14_conv2d_i8_extern_vec/gen.py b/test/xrt/14_conv2d_i8_extern_vec/gen.py index eee6004af..8eacf5ec6 100644 --- a/test/xrt/14_conv2d_i8_extern_vec/gen.py +++ b/test/xrt/14_conv2d_i8_extern_vec/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager import argparse @@ -116,11 +116,11 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, lower_linalg_to_func="conv.o", trace_offset=opts.trace_offset, trace_size=opts.trace_size, runtime_loop_tiling_sizes=[1, 1], use_lock_race_condition_fix=True, ) - backend.compile(air_module) diff --git a/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py b/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py index 871d846ca..3c80645ee 100644 --- a/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py +++ b/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -159,9 +159,9 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, lower_linalg_to_func="mm.o", use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py b/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py index 54bd81322..eaab080e4 100644 --- a/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py +++ b/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -159,9 +159,9 @@ # Run compile and load ############################################### -backend = XRTBackend( +compile_air( + air_module, lower_linalg_to_func="mm.o", use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) -backend.compile(air_module) diff --git a/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py b/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py index f3157db7e..9082b6b03 100644 --- a/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py +++ b/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -135,10 +135,10 @@ # Run compile and load ############################################### -backend = XRTBackend( +compile_air( + air_module, air_loop_fusion=True, lower_linalg_to_func="mm.o", runtime_loop_tiling_sizes=[1, 1], use_lock_race_condition_fix=True, ) -backend.compile(air_module) diff --git a/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py b/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py index 16b7a6e51..09d22e61d 100644 --- a/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py +++ b/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py @@ -1,11 +1,10 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager - ################################################ ## Input SCF and Linalg IR ################################################ @@ -111,9 +110,9 @@ # Run compile and load ############################################### -backend = XRTBackend( +compile_air( + air_module, air_loop_fusion=True, lower_linalg_to_func="kernel.o", runtime_loop_tiling_sizes=[2, 2], ) -backend.compile(air_module) diff --git a/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py b/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py index 790927e04..b1205fd6d 100644 --- a/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py +++ b/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py @@ -1,11 +1,10 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager - ################################################ ## Input SCF and Linalg IR ################################################ @@ -109,9 +108,9 @@ # Run compile and load ############################################### -backend = XRTBackend( +compile_air( + air_module, air_loop_fusion=True, lower_linalg_to_func="kernel.o", runtime_loop_tiling_sizes=[4, 4], ) -backend.compile(air_module) diff --git a/test/xrt/20_batch_matmul_i32/gen.py b/test/xrt/20_batch_matmul_i32/gen.py index 566c53e3a..01394299c 100644 --- a/test/xrt/20_batch_matmul_i32/gen.py +++ b/test/xrt/20_batch_matmul_i32/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -159,8 +159,8 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, air_loop_fusion=True, runtime_loop_tiling_sizes=[8, 1], # Note: [4, 4] gives numeric error. Why? ) - backend.compile(air_module) diff --git a/test/xrt/21_conv2d_depthwise_i32/gen.py b/test/xrt/21_conv2d_depthwise_i32/gen.py index 0b0b178db..91f9b7b99 100644 --- a/test/xrt/21_conv2d_depthwise_i32/gen.py +++ b/test/xrt/21_conv2d_depthwise_i32/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -96,8 +96,8 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, air_loop_fusion=True, runtime_loop_tiling_sizes=[2, 4], ) - backend.compile(air_module) diff --git a/test/xrt/22_conv2d_stride2_i32/gen.py b/test/xrt/22_conv2d_stride2_i32/gen.py index 4dc4deab7..c4ec60091 100644 --- a/test/xrt/22_conv2d_stride2_i32/gen.py +++ b/test/xrt/22_conv2d_stride2_i32/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -101,8 +101,8 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, runtime_loop_tiling_sizes=[1, 1], use_lock_race_condition_fix=True, ) - backend.compile(air_module) diff --git a/test/xrt/25_batch_matmul_bf16/gen.py b/test/xrt/25_batch_matmul_bf16/gen.py index a6032fcdd..ed60bd207 100644 --- a/test/xrt/25_batch_matmul_bf16/gen.py +++ b/test/xrt/25_batch_matmul_bf16/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -159,9 +159,9 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, lower_linalg_to_func="mm.o", air_loop_fusion=True, runtime_loop_tiling_sizes=[1, 1], ) - backend.compile(air_module) diff --git a/test/xrt/26_vecmat_i8/gen.py b/test/xrt/26_vecmat_i8/gen.py index 0b1d3514b..ef0951a9f 100644 --- a/test/xrt/26_vecmat_i8/gen.py +++ b/test/xrt/26_vecmat_i8/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -120,9 +120,9 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, lower_linalg_to_func="vm.o", use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py b/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py index 582e9066e..bd515b928 100644 --- a/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py +++ b/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -160,8 +160,8 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, runtime_loop_tiling_sizes=[2, 2], use_lock_race_condition_fix=True, ) - backend.compile(air_module) diff --git a/test/xrt/28_gemm_loop_nest_bf16/gen.py b/test/xrt/28_gemm_loop_nest_bf16/gen.py index 065e4c52c..461803ee9 100644 --- a/test/xrt/28_gemm_loop_nest_bf16/gen.py +++ b/test/xrt/28_gemm_loop_nest_bf16/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -121,10 +121,10 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, omit_pingpong=True, lower_linalg_to_func="mm.o", use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py b/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py index f95f2c5d5..c22a2ce7b 100644 --- a/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py +++ b/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py @@ -3,7 +3,7 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air from air.ir import * import air.passmanager @@ -130,9 +130,9 @@ # Run compile and load ############################################### - backend = XRTBackend( + compile_air( + air_module, omit_pingpong=True, lower_linalg_to_func="mm.o", runtime_loop_tiling_sizes=[4, 4], ) - backend.compile(air_module) diff --git a/test/xrt/30_mul_rtp_1x1/run.py b/test/xrt/30_mul_rtp_1x1/run.py index 4356df1d7..bb55c8425 100644 --- a/test/xrt/30_mul_rtp_1x1/run.py +++ b/test/xrt/30_mul_rtp_1x1/run.py @@ -3,7 +3,7 @@ # RUN: %PYTHON %s | FileCheck %s -import air.backend.xrt as xrt_backend +from air.backend.xrt import compile_air, get_air_runtime from air.dialects.air import * from air.dialects.func import FuncOp, ReturnOp import air.dialects.linalg.opdsl.lang as linalg_lang @@ -12,6 +12,7 @@ from air.ir import * import argparse +import aie.utils import numpy as np np.random.seed(42) @@ -115,20 +116,25 @@ def run_test(size, idtype, odtype): ref = (input_a * input_b).astype(odtype) input_c = np.ones_like(ref) - backend = xrt_backend.XRTBackend( + npu_kernel = compile_air( + mlir_module, omit_pingpong=True, verbose=verbose, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - # run the module + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(input_c), + ] with filelock.FileLock("/tmp/npu.lock"): - mul = backend.compile_and_load(mlir_module) + handle = runtime.load(npu_kernel) print("running") - _, _, output_c = mul(input_a, input_b, input_c) - - backend.unload() + runtime.run(handle, io_args) + output_c = io_args[2].numpy() print("inputA:", input_a) print("inputB:", input_b) diff --git a/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py b/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py index ff91dad5b..3eb462264 100644 --- a/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py +++ b/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py @@ -4,11 +4,10 @@ # SPDX-License-Identifier: MIT import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.ir import * import air.passmanager -import filelock +import aie.utils import numpy as np @@ -104,19 +103,20 @@ inputs_b = (np.random.rand(*input_size)).reshape(input_size).astype(input_type) ref = (inputs_a * inputs_b).astype(input_type) - ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, use_lock_race_condition_fix=True, output_format=args.output_format, instance_name="kernel", runtime_loop_tiling_sizes=[4, 4], ) - exit( - runner.run_test( - air_module, - inputs=[inputs_a, inputs_b], - expected_outputs=[ref], - rtol=1e-3, - ) - ) + + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(inputs_a), + aie.utils.tensor(inputs_b), + aie.utils.tensor(np.zeros(ref.shape, ref.dtype)), + ] + refs = {2: ref} + exit(runtime.run_test(npu_kernel, io_args, refs=refs, rtol=1e-3)) diff --git a/test/xrt/32_triton_matmul/run.py b/test/xrt/32_triton_matmul/run.py index 15578d5ea..7f1f3e1b1 100644 --- a/test/xrt/32_triton_matmul/run.py +++ b/test/xrt/32_triton_matmul/run.py @@ -3,12 +3,11 @@ # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager -import filelock +import aie.utils import numpy as np @@ -121,17 +120,18 @@ B = np.random.rand(K, N).astype(input_type) # Shape [K, N] C = np.matmul(A, B).astype(input_type) # Shape [M, N] - ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - rtol=1e-3, - ) - ) + + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + refs = {2: C} + exit(runtime.run_test(npu_kernel, io_args, refs=refs, rtol=1e-3)) diff --git a/test/xrt/33_triton_matmul_ver2/run.py b/test/xrt/33_triton_matmul_ver2/run.py index b859be202..dc54545cc 100644 --- a/test/xrt/33_triton_matmul_ver2/run.py +++ b/test/xrt/33_triton_matmul_ver2/run.py @@ -4,12 +4,11 @@ # SPDX-License-Identifier: MIT import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager -import filelock +import aie.utils import numpy as np @@ -106,19 +105,20 @@ B = np.random.rand(K, N).astype(input_type) # Shape [K, N] C = np.matmul(A, B).astype(input_type) # Shape [M, N] - ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], output_format=args.output_format, instance_name="bare_matmul", ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - rtol=1e-3, - ) - ) + + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + refs = {2: C} + exit(runtime.run_test(npu_kernel, io_args, refs=refs, rtol=1e-3)) diff --git a/test/xrt/34_cascade_vecadd/run_chess.py b/test/xrt/34_cascade_vecadd/run_chess.py index b5429a096..610f96093 100644 --- a/test/xrt/34_cascade_vecadd/run_chess.py +++ b/test/xrt/34_cascade_vecadd/run_chess.py @@ -13,8 +13,8 @@ import argparse import sys -from air.backend.xrt_runner import XRTRunner -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air, get_air_runtime +import aie.utils import numpy as np @@ -97,44 +97,40 @@ ############################################### input_a = np.arange(0, 2048, dtype=np.int32) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)]) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] + 4 for i in zip(*sampled_indices)], - dtype=np.int32, - ) - - # Store as a dictionary - sampled_data = { - "shape": (2048), - "indices": sampled_indices, - "values": sampled_values, - } - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + ) + + if args.compile_mode == "compile-only": + exit(0) + + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)]) + + # Compute reference results for sampled indices + sampled_values = np.array( + [input_a[i] + 4 for i in zip(*sampled_indices)], + dtype=np.int32, + ) + + # Store as a dictionary + sampled_data = { + "shape": (2048), + "indices": sampled_indices, + "values": sampled_values, + } + + runtime = get_air_runtime() + dtype = sampled_data["values"].dtype + shape = sampled_data["shape"] + if isinstance(shape, int): + shape = (shape,) + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(np.zeros(shape, dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data])) diff --git a/test/xrt/34_cascade_vecadd/run_peano.py b/test/xrt/34_cascade_vecadd/run_peano.py index 3c28274a1..cb015b2df 100644 --- a/test/xrt/34_cascade_vecadd/run_peano.py +++ b/test/xrt/34_cascade_vecadd/run_peano.py @@ -13,8 +13,8 @@ import argparse import sys -from air.backend.xrt_runner import XRTRunner -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air, get_air_runtime +import aie.utils import numpy as np @@ -100,44 +100,40 @@ ############################################### input_a = np.arange(0, 2048, dtype=np.int32) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)]) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] + 4 for i in zip(*sampled_indices)], - dtype=np.int32, - ) - - # Store as a dictionary - sampled_data = { - "shape": (2048), - "indices": sampled_indices, - "values": sampled_values, - } - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + ) + + if args.compile_mode == "compile-only": + exit(0) + + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)]) + + # Compute reference results for sampled indices + sampled_values = np.array( + [input_a[i] + 4 for i in zip(*sampled_indices)], + dtype=np.int32, + ) + + # Store as a dictionary + sampled_data = { + "shape": (2048), + "indices": sampled_indices, + "values": sampled_values, + } + + runtime = get_air_runtime() + dtype = sampled_data["values"].dtype + shape = sampled_data["shape"] + if isinstance(shape, int): + shape = (shape,) + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(np.zeros(shape, dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data])) diff --git a/test/xrt/35_herd_reduce/run.py b/test/xrt/35_herd_reduce/run.py index 4e76bd7ff..e7348a073 100644 --- a/test/xrt/35_herd_reduce/run.py +++ b/test/xrt/35_herd_reduce/run.py @@ -13,8 +13,8 @@ import argparse import sys -from air.backend.xrt_runner import XRTRunner -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air, get_air_runtime +import aie.utils import numpy as np @@ -106,48 +106,42 @@ ############################################### input_a = np.arange(0, 2048, dtype=np.int32) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)]) - - # Compute reference results for sampled indices - sampled_values = np.array( - [input_a[i] + 4 for i in zip(*sampled_indices)], - dtype=np.int32, - ) - - # Store as a dictionary - sampled_data = { - "shape": (2048), - "indices": sampled_indices, - "values": sampled_values, - } - ###### Compile and test - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="scf1", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[input_a], - stochastic_expected_outputs=[sampled_data], - ) - ) - - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="scf1", - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(mlir_module) - - backend.unload() + npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + output_format=args.output_format, + instance_name="scf1", + runtime_loop_tiling_sizes=[4, 4], + ) + + if args.compile_mode == "compile-only": + exit(0) + + num_samples = 100 + sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)]) + + # Compute reference results for sampled indices + sampled_values = np.array( + [input_a[i] + 4 for i in zip(*sampled_indices)], + dtype=np.int32, + ) + + # Store as a dictionary + sampled_data = { + "shape": (2048), + "indices": sampled_indices, + "values": sampled_values, + } + + runtime = get_air_runtime() + dtype = sampled_data["values"].dtype + shape = sampled_data["shape"] + if isinstance(shape, int): + shape = (shape,) + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(np.zeros(shape, dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data])) diff --git a/test/xrt/36_cascade_vecmat_i32/run.py b/test/xrt/36_cascade_vecmat_i32/run.py index b46648f69..5731eb393 100644 --- a/test/xrt/36_cascade_vecmat_i32/run.py +++ b/test/xrt/36_cascade_vecmat_i32/run.py @@ -13,8 +13,12 @@ import argparse import sys -from air.backend.xrt_runner import XRTRunner -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air, get_air_runtime +import aie.utils + +import numpy as np + +np.random.seed(42) parser = argparse.ArgumentParser( prog="run.py", @@ -111,8 +115,8 @@ linalg.vecmat ins(%alloc_17, %alloc_18 : memref<32xi32, 2 : i32>, memref<32x32xi32, 2 : i32>) outs(%subview_16 : memref<32xi32, strided<[1]>, 2 : i32>) memref.dealloc %alloc_17 : memref<32xi32, 2 : i32> memref.dealloc %alloc_18 : memref<32x32xi32, 2 : i32> - } - + } + scf.reduce(%alloc_12 : memref<32xi32, 2 : i32>) { ^bb0(%a4: memref<32xi32, 2 : i32>, %a5: memref<32xi32, 2 : i32>): linalg.add ins(%a4, %a5 : memref<32xi32, 2 : i32>, memref<32xi32, 2 : i32>) outs(%a4 : memref<32xi32, 2 : i32>) @@ -166,28 +170,23 @@ K, ) input_b = np.arange(0, K * N, dtype=np.int32).reshape(K, N) - if args.compile_mode == "compile-and-run": - output_c = np.dot(input_a.astype(np.int32), input_b.astype(np.int32)) - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[input_a, input_b], - expected_outputs=[output_c], - ) - ) - elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(air_module) + npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + ) + + if args.compile_mode == "compile-only": + exit(0) - backend.unload() + output_c = np.dot(input_a.astype(np.int32), input_b.astype(np.int32)) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(np.zeros(output_c.shape, output_c.dtype)), + ] + refs = {2: output_c} + exit(runtime.run_test(npu_kernel, io_args, refs=refs)) diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py index d950a6367..233ec64a9 100644 --- a/test/xrt/37_matmul_transform_4x4_bf16/run.py +++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py @@ -9,12 +9,10 @@ from air.compiler.util import run_transform import argparse -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air, get_air_runtime from air.ir import * import air.passmanager - -from air.backend.xrt_runner import XRTRunner -from air.backend.xrt import XRTBackend +import aie.utils from ml_dtypes import bfloat16 import numpy as np @@ -165,59 +163,59 @@ def forward(lhs, rhs): N = args.N input_a = np.arange(0, M * K, dtype=bfloat16).reshape(M, K) input_b = np.arange(0, K * N, dtype=bfloat16).reshape(K, N) -if args.compile_mode == "compile-and-run": - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, args.M, num_samples), # i indices - np.random.randint(0, args.N, num_samples), # j indices - ] - ) - # Compute reference results for sampled indices - sampled_values = np.array( - [ - np.sum( - (input_a[i, :].astype(np.float32) * input_b[:, j].astype(np.float32)), - dtype=np.float32, - ) - for i, j in zip(*sampled_indices) - ], - dtype=np.float32, - ) +npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + output_format=args.output_format, + instance_name="forward", +) - # Store as a dictionary - sampled_data = { - "shape": (args.M, args.N), - "indices": sampled_indices, - "values": sampled_values, - } - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - output_format=args.output_format, - instance_name="forward", - ) - exit( - runner.run_test( - air_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=1e-1, +if args.compile_mode == "compile-only": + exit(0) + +# Stochastically sample num_sample results, and pass to runtime backend for verification. +num_samples = 100 +sampled_indices = np.vstack( + [ + np.random.randint(0, args.M, num_samples), # i indices + np.random.randint(0, args.N, num_samples), # j indices + ] +) + +# Compute reference results for sampled indices +sampled_values = np.array( + [ + np.sum( + (input_a[i, :].astype(np.float32) * input_b[:, j].astype(np.float32)), + dtype=np.float32, ) - ) + for i, j in zip(*sampled_indices) + ], + dtype=np.float32, +) -elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="forward", - runtime_loop_tiling_sizes=[4, 4], +# Store as a dictionary +sampled_data = { + "shape": (args.M, args.N), + "indices": sampled_indices, + "values": sampled_values, +} + +runtime = get_air_runtime() +dtype = sampled_data["values"].dtype +shape = sampled_data["shape"] +if isinstance(shape, int): + shape = (shape,) +io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(np.zeros(shape, dtype)), +] +exit( + runtime.run_test( + npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data], rtol=1e-1 ) - module_function = backend.compile(air_module) - - backend.unload() +) diff --git a/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py b/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py index 48942b6f0..b416370f8 100644 --- a/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py +++ b/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py @@ -9,12 +9,10 @@ from air.compiler.util import run_transform import argparse -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air, get_air_runtime from air.ir import * import air.passmanager - -from air.backend.xrt_runner import XRTRunner -from air.backend.xrt import XRTBackend +import aie.utils import numpy as np @@ -161,55 +159,54 @@ def forward(lhs, rhs): max_val = 1024 # Conservative estimate input_a = np.random.randint(1, max_val + 1, size=(1, K), dtype=np.int32) input_b = np.random.randint(1, max_val + 1, size=(K, N), dtype=np.int32) -if args.compile_mode == "compile-and-run": - # Stochastically sample num_sample results, and pass to XRTRunner backend for verification. - num_samples = 100 - sampled_indices = np.vstack( - [ - np.random.randint(0, 1, num_samples), # i indices - np.random.randint(0, args.N, num_samples), # j indices - ] - ) - # Compute reference results for sampled indices - sampled_values = np.array( - [ - np.sum( - (input_a[i, :].astype(np.int32) * input_b[:, j].astype(np.int32)), - dtype=np.int32, - ) - for i, j in zip(*sampled_indices) - ], - dtype=np.int32, - ) - print(sampled_values) - - # Store as a dictionary - sampled_data = { - "shape": (1, args.N), - "indices": sampled_indices, - "values": sampled_values, - } - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - ) - ) +npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], +) -elif args.compile_mode == "compile-only": - ###### Compile only - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(air_module) +if args.compile_mode == "compile-only": + exit(0) - backend.unload() +# Stochastically sample num_sample results, and pass to runtime backend for verification. +num_samples = 100 +sampled_indices = np.vstack( + [ + np.random.randint(0, 1, num_samples), # i indices + np.random.randint(0, args.N, num_samples), # j indices + ] +) + +# Compute reference results for sampled indices +sampled_values = np.array( + [ + np.sum( + (input_a[i, :].astype(np.int32) * input_b[:, j].astype(np.int32)), + dtype=np.int32, + ) + for i, j in zip(*sampled_indices) + ], + dtype=np.int32, +) +print(sampled_values) + +# Store as a dictionary +sampled_data = { + "shape": (1, args.N), + "indices": sampled_indices, + "values": sampled_values, +} + +runtime = get_air_runtime() +dtype = sampled_data["values"].dtype +shape = sampled_data["shape"] +if isinstance(shape, int): + shape = (shape,) +io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(np.zeros(shape, dtype)), +] +exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data])) diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py index 5099f2452..8f2a8128a 100644 --- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py +++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py @@ -4,13 +4,12 @@ # SPDX-License-Identifier: MIT import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager from ml_dtypes import bfloat16 -import filelock +import aie.utils import numpy as np @@ -125,15 +124,13 @@ C = np.matmul(A, B).astype(output_type) # Shape [M, N] ###### Compile and test - runner = XRTRunner( - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - rtol=1e-3, - ) + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4] ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-3)) diff --git a/test/xrt/40_triton_vec_add/run.py b/test/xrt/40_triton_vec_add/run.py index 0700e59eb..28e64c026 100644 --- a/test/xrt/40_triton_vec_add/run.py +++ b/test/xrt/40_triton_vec_add/run.py @@ -4,13 +4,12 @@ # SPDX-License-Identifier: MIT import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager from ml_dtypes import bfloat16 -import filelock +import aie.utils import numpy as np np.random.seed(42) @@ -130,16 +129,16 @@ C = np.add(A, B).astype(output_type) # Shape [M] ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, use_lock_race_condition_fix=True, runtime_loop_tiling_sizes=[4, 4], ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - rtol=1e-2, - ) - ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-2)) diff --git a/test/xrt/41_triton_softmax/run.py b/test/xrt/41_triton_softmax/run.py index 392a9ef32..b8f299ed0 100644 --- a/test/xrt/41_triton_softmax/run.py +++ b/test/xrt/41_triton_softmax/run.py @@ -7,12 +7,11 @@ import numpy as np np.random.seed(42) -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager -import filelock +import aie.utils parser = argparse.ArgumentParser( prog="run.py", @@ -200,35 +199,18 @@ def softmax(x, axis=-1): # Run compile and load ############################################### + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4] + ) + if args.compile_only: - # Compile-only mode: generate xclbin and instruction binary without validation - print("Compile-only mode: generating xclbin and instruction binary...") - backend = XRTBackend( - omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4] - ) - module_function = backend.compile(air_module) - backend.unload() - print("Compilation complete. Generated files:") - print(" - air.xclbin") - print(" - air.insts.bin") - print("Run profiling with: ./test.exe") exit(0) - else: - # Normal mode: compile and run validation - input_type = np.float32 - A = np.random.rand(M, N).astype(input_type) # Shape [M, N] - C = softmax(A).astype(input_type) - - ###### Compile and test - runner = XRTRunner( - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[A], - expected_outputs=[C], - rtol=1e-2, - ) - ) + + # Normal mode: compile and run validation + input_type = np.float32 + A = np.random.rand(M, N).astype(input_type) # Shape [M, N] + C = softmax(A).astype(input_type) + + runtime = get_air_runtime() + io_args = [aie.utils.tensor(A), aie.utils.tensor(np.zeros(C.shape, C.dtype))] + exit(runtime.run_test(npu_kernel, io_args, refs={1: C}, rtol=1e-2)) diff --git a/test/xrt/42_triton_softmax_bf16/run.py b/test/xrt/42_triton_softmax_bf16/run.py index 40f01d479..9a03b3af3 100644 --- a/test/xrt/42_triton_softmax_bf16/run.py +++ b/test/xrt/42_triton_softmax_bf16/run.py @@ -7,12 +7,11 @@ import numpy as np np.random.seed(42) -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager -import filelock +import aie.utils from ml_dtypes import bfloat16 parser = argparse.ArgumentParser( @@ -201,18 +200,13 @@ def softmax(x, axis=-1): C = softmax(A).astype(input_type) ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, output_format=args.output_format, instance_name="softmax_kernel", runtime_loop_tiling_sizes=[4, 4], ) - exit( - runner.run_test( - air_module, - inputs=[A], - expected_outputs=[C], - rtol=1e-2, - atol=1e-3, - ) - ) + runtime = get_air_runtime() + io_args = [aie.utils.tensor(A), aie.utils.tensor(np.zeros(C.shape, C.dtype))] + exit(runtime.run_test(npu_kernel, io_args, refs={1: C}, rtol=1e-2, atol=1e-3)) diff --git a/test/xrt/43_triton_layernorm/run.py b/test/xrt/43_triton_layernorm/run.py index 43ce473e8..2baa7346b 100644 --- a/test/xrt/43_triton_layernorm/run.py +++ b/test/xrt/43_triton_layernorm/run.py @@ -5,12 +5,11 @@ import argparse import numpy as np -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager -import filelock +import aie.utils parser = argparse.ArgumentParser( prog="run.py", @@ -295,7 +294,8 @@ def layer_norm(x_arg, y, weight, bias, mean, rstd, eps=1e-5): y_expected = (x_arg - mean.reshape(-1, 1)) * rstd.reshape(-1, 1) # Shape [M, N] ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, output_format=args.output_format, instance_name="_layer_norm_fwd_fused", @@ -304,11 +304,16 @@ def layer_norm(x_arg, y, weight, bias, mean, rstd, eps=1e-5): debug_ir=args.debug_ir, runtime_loop_tiling_sizes=[4, 4], ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(x_arg), + aie.utils.tensor(np.zeros(y_expected.shape, y_expected.dtype)), + ] exit( - runner.run_test( - air_module, - inputs=[x_arg], - expected_outputs=[y_expected], + runtime.run_test( + npu_kernel, + io_args, + refs={1: y_expected}, rtol=5e-2 if args.bf16_emulation else 1e-2, atol=5e-1 if args.bf16_emulation else 1e-1, ) diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py index 0f3414561..1264eb960 100644 --- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py +++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py @@ -4,13 +4,12 @@ # SPDX-License-Identifier: MIT import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager from ml_dtypes import bfloat16 -import filelock +import aie.utils import numpy as np @@ -133,17 +132,17 @@ C = np.matmul(A, B).astype(output_type) # Shape [M, N] ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4], output_format=args.output_format, instance_name="bare_matmul", ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - rtol=1e-1, - ) - ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-1)) diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py index fb2ca22dd..b42523778 100644 --- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py +++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py @@ -4,13 +4,12 @@ # SPDX-License-Identifier: MIT import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager from ml_dtypes import bfloat16 -import filelock +import aie.utils import numpy as np @@ -134,17 +133,17 @@ C = np.matmul(A, B).astype(output_type) # Shape [M, N] ###### Compile and test - runner = XRTRunner( + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4], output_format=args.output_format, instance_name="bare_matmul", ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - rtol=1e-1, - ) - ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-1)) diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py index 83c7cdf03..1f1d75723 100644 --- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py +++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py @@ -7,12 +7,11 @@ import numpy as np np.random.seed(42) -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager -import filelock +import aie.utils from ml_dtypes import bfloat16 as bfloat16_t parser = argparse.ArgumentParser( @@ -130,54 +129,31 @@ # Run compile and load ############################################### - # Determine output file extension based on format - output_ext = "elf" if args.output_format == "elf" else "xclbin" + npu_kernel = compile_air( + air_module, + omit_while_true_loop=False, + output_format=args.output_format, + instance_name="bare_matmul", + runtime_loop_tiling_sizes=[4, 4], + ) if args.compile_only: - # Compile-only mode: generate binary without validation - print(f"Compile-only mode: generating {output_ext} binary...") - backend = XRTBackend( - omit_while_true_loop=False, - output_format=args.output_format, - instance_name="bare_matmul", - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(air_module) - backend.unload() - print("Compilation complete. Generated files:") - print(f" - air.{output_ext}") - if args.output_format == "xclbin": - print(" - air.insts.bin") - print("Run profiling with: ./test.exe") exit(0) - else: - # Normal mode: compile and run validation - - input_type = np.int8 - output_type = np.int32 - A = np.random.randint( - low=0, high=8, size=(M, K), dtype=input_type - ) # Shape [M, K] - B = np.random.randint( - low=0, high=8, size=(K, N), dtype=input_type - ) # Shape [K, N] - - C = np.matmul(A.astype(output_type), B.astype(output_type)).astype( - output_type - ) # Shape [M, N] - - runner = XRTRunner( - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - output_format=args.output_format, - instance_name="bare_matmul", - # verbose=True, - ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - # rtol=1e-1, - ) - ) + + # Normal mode: compile and run validation + input_type = np.int8 + output_type = np.int32 + A = np.random.randint(low=0, high=8, size=(M, K), dtype=input_type) # Shape [M, K] + B = np.random.randint(low=0, high=8, size=(K, N), dtype=input_type) # Shape [K, N] + + C = np.matmul(A.astype(output_type), B.astype(output_type)).astype( + output_type + ) # Shape [M, N] + + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={2: C})) diff --git a/test/xrt/47_multi_launch_pdi_reconfig/run.py b/test/xrt/47_multi_launch_pdi_reconfig/run.py index 90f405955..77e0a2f5a 100644 --- a/test/xrt/47_multi_launch_pdi_reconfig/run.py +++ b/test/xrt/47_multi_launch_pdi_reconfig/run.py @@ -5,13 +5,14 @@ # SPDX-License-Identifier: MIT """ -Test script that uses XRTRunner to compile, run, and validate the +Test script that compiles, runs, and validates the multi-launch PDI reconfiguration example using ELF output format. """ import numpy as np -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.ir import * +import aie.utils # Define the AIR module with two air.launch operations using iteration spaces # - Launch 1 (add_two): iterates 8 times, processing tiles at offsets 0,16,32,...,112 (adds 2) @@ -157,21 +158,19 @@ def main(): with Context() as ctx, Location.unknown(): air_module = Module.parse(air_tiled_ir_string) - # Create XRTRunner with ELF output format - # instance_name should match the func.func name (@reconfigure_example) - runner = XRTRunner( + npu_kernel = compile_air( + air_module, output_format="elf", - instance_name="reconfigure_example", # matches func.func @reconfigure_example + instance_name="reconfigure_example", omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4], ) - - # Run the test - result = runner.run_test( - mlir_module=air_module, - inputs=[input_data], - expected_outputs=[expected_output], - ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_data), + aie.utils.tensor(np.zeros(expected_output.shape, expected_output.dtype)), + ] + result = runtime.run_test(npu_kernel, io_args, refs={1: expected_output}) return result diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py index b6904605a..114e0ac2d 100644 --- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py +++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py @@ -4,13 +4,12 @@ # SPDX-License-Identifier: MIT import argparse -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager from ml_dtypes import bfloat16 -import filelock +import aie.utils import numpy as np @@ -130,15 +129,13 @@ C = np.matmul(A, B).astype(output_type) # Shape [M, N] ###### Compile and test - runner = XRTRunner( - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[A, B], - expected_outputs=[C], - rtol=1e-1, - ) + npu_kernel = compile_air( + air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4] ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros(C.shape, C.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-1)) diff --git a/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py b/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py index 130465fa5..615552fdb 100644 --- a/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py +++ b/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py @@ -11,12 +11,11 @@ import numpy as np np.random.seed(42) -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager -import filelock +import aie.utils from ml_dtypes import bfloat16 # Get the directory containing this script @@ -258,48 +257,35 @@ def softmax(x, axis=-1): # Run compile and load ############################################### + npu_kernel = compile_air( + air_module, + omit_while_true_loop=False, + verbose=args.verbose, + debug_ir=args.debug_aircc, + output_format=args.output_format, + instance_name="softmax_kernel", + runtime_loop_tiling_sizes=[4, 4], + ) + if args.compile_only: - # Compile-only mode: generate xclbin and instruction binary without validation - print("Compile-only mode: generating xclbin and instruction binary...") - backend = XRTBackend( - omit_while_true_loop=False, - verbose=args.verbose, - debug_ir=args.debug_aircc, - output_format=args.output_format, - instance_name="softmax_kernel", - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(air_module) - backend.unload() - print("Compilation complete. Generated files:") - print(" - air.xclbin") - print(" - air.insts.bin") - print("Run profiling with: ./test.exe") exit(0) - else: - # Normal mode: compile and run validation - input_type = bfloat16 - # Generate random input in range [-512, 512] - A = (np.random.rand(M, N) * 1024 - 512).astype( - input_type - ) # Shape [M, N], range [-512, 512] - C = softmax(A).astype(input_type) - ###### Compile and test - runner = XRTRunner( - omit_while_true_loop=False, - verbose=args.verbose, - debug_ir=args.debug_aircc, - output_format=args.output_format, - instance_name="softmax_kernel", - runtime_loop_tiling_sizes=[4, 4], - ) - exit( - runner.run_test( - air_module, - inputs=[A], - expected_outputs=[C], - rtol=0.04, # 4% relative tolerance (matches mlir-aie reference) - atol=0.001, # Absolute tolerance (matches mlir-aie reference) - ) + # Normal mode: compile and run validation + input_type = bfloat16 + # Generate random input in range [-512, 512] + A = (np.random.rand(M, N) * 1024 - 512).astype( + input_type + ) # Shape [M, N], range [-512, 512] + C = softmax(A).astype(input_type) + + runtime = get_air_runtime() + io_args = [aie.utils.tensor(A), aie.utils.tensor(np.zeros(C.shape, C.dtype))] + exit( + runtime.run_test( + npu_kernel, + io_args, + refs={1: C}, + rtol=0.04, # 4% relative tolerance (matches mlir-aie reference) + atol=0.001, ) + ) # Absolute tolerance (matches mlir-aie reference) diff --git a/test/xrt/50_multi_launch_attention/run.py b/test/xrt/50_multi_launch_attention/run.py index 928a5843b..791658428 100644 --- a/test/xrt/50_multi_launch_attention/run.py +++ b/test/xrt/50_multi_launch_attention/run.py @@ -15,11 +15,12 @@ 4. Comparing results """ -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.ir import * from ml_dtypes import bfloat16 import numpy as np import os +import aie.utils def softmax(x, axis=-1): @@ -74,18 +75,21 @@ def softmax(x, axis=-1): # Run test ############################################### - runner = XRTRunner( + npu_kernel = compile_air( + air_module, output_format="elf", - instance_name="attention", # matches func.func @attention + instance_name="attention", omit_while_true_loop=False, verbose=False, runtime_loop_tiling_sizes=[4, 4], ) - exit( - runner.run_test( - mlir_module=air_module, - inputs=[Q, K_T, V, S_buffer, P_buffer], - expected_outputs=[O_expected], - atol=2e3, - ) - ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(Q), + aie.utils.tensor(K_T), + aie.utils.tensor(V), + aie.utils.tensor(S_buffer), + aie.utils.tensor(P_buffer), + aie.utils.tensor(np.zeros(O_expected.shape, O_expected.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={5: O_expected}, atol=2e3)) diff --git a/test/xrt/51_scf_if_channel_herd/run.py b/test/xrt/51_scf_if_channel_herd/run.py index 4d5d4cbb4..d21118de4 100644 --- a/test/xrt/51_scf_if_channel_herd/run.py +++ b/test/xrt/51_scf_if_channel_herd/run.py @@ -21,7 +21,9 @@ from air.dialects.func import FuncOp from air.dialects import arith, scf from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt import compile_air, get_air_runtime +from air.backend.xrt_runner import type_mapper +import aie.utils range_ = for_ @@ -143,10 +145,16 @@ def herd_body(tx, ty, sx, sy): input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE) output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE) - runner = XRTRunner( + npu_kernel = compile_air( + mlir_module, verbose=args.verbose, output_format=args.output_format, instance_name="copy", runtime_loop_tiling_sizes=[4, 4], ) - exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b])) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(np.zeros(output_b.shape, output_b.dtype)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={1: output_b})) diff --git a/test/xrt/52_dma_pad_passthrough/run.py b/test/xrt/52_dma_pad_passthrough/run.py index 85dc000c4..a8aaaca65 100644 --- a/test/xrt/52_dma_pad_passthrough/run.py +++ b/test/xrt/52_dma_pad_passthrough/run.py @@ -22,7 +22,9 @@ from air.dialects import arith from air.dialects.memref import AllocOp, DeallocOp from air.dialects.func import FuncOp -from air.backend.xrt_runner import XRTRunner, type_mapper +from air.backend.xrt import compile_air, get_air_runtime +from air.backend.xrt_runner import type_mapper +import aie.utils INPUT_ROWS = 64 INPUT_COLS = 480 @@ -189,16 +191,16 @@ def herd_body(tx, ty, sx, sy, h_l2_in, h_l2_out): "values": sampled_values, } - runner = XRTRunner( + npu_kernel = compile_air( + mlir_module, verbose=args.verbose, output_format=args.output_format, instance_name="pad_passthrough", runtime_loop_tiling_sizes=[4, 4], ) - exit( - runner.run_test( - mlir_module, - inputs=[input_data], - stochastic_expected_outputs=[sampled_data], - ) - ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_data), + aie.utils.tensor(np.zeros((INPUT_ROWS, PADDED_COLS), INOUT_DATATYPE)), + ] + exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data])) diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py index 8634ef1b6..320410c29 100644 --- a/test/xrt/53_matmul_padding_bf16/run.py +++ b/test/xrt/53_matmul_padding_bf16/run.py @@ -14,13 +14,12 @@ import argparse import math -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager from ml_dtypes import bfloat16 -import filelock +import aie.utils import numpy as np @@ -237,42 +236,44 @@ C_ref = np.zeros((M_padded, N_padded), dtype=output_type) C_ref[:M_actual, :N_actual] = C_ref_actual - if args.compile_mode == "compile-and-run": - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - output_format="elf" if needs_padding else "xclbin", - instance_name="matmul_bf16", - ) + npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + output_format="elf" if needs_padding else "xclbin", + instance_name="matmul_bf16", + ) - num_samples = 200 - sampled_row = np.random.randint(0, M_actual, num_samples) - sampled_col = np.random.randint(0, N_actual, num_samples) - sampled_indices = np.vstack([sampled_row, sampled_col]) - sampled_values = np.array( - [C_ref_actual[r, c] for r, c in zip(sampled_row, sampled_col)], - dtype=output_type, - ) - sampled_data = { - "shape": (M_padded, N_padded), - "indices": sampled_indices, - "values": sampled_values, - } + if args.compile_mode == "compile-only": + exit(0) - exit( - runner.run_test( - air_module, - inputs=[A, B], - stochastic_expected_outputs=[sampled_data], - rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)), - ) - ) - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], + num_samples = 200 + sampled_row = np.random.randint(0, M_actual, num_samples) + sampled_col = np.random.randint(0, N_actual, num_samples) + sampled_indices = np.vstack([sampled_row, sampled_col]) + sampled_values = np.array( + [C_ref_actual[r, c] for r, c in zip(sampled_row, sampled_col)], + dtype=output_type, + ) + sampled_data = { + "shape": (M_padded, N_padded), + "indices": sampled_indices, + "values": sampled_values, + } + + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(A), + aie.utils.tensor(B), + aie.utils.tensor(np.zeros((M_padded, N_padded), output_type)), + ] + exit( + runtime.run_test( + npu_kernel, + io_args, + refs={}, + stochastic_refs=[sampled_data], + rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)), ) - module_function = backend.compile(air_module) - backend.unload() + ) diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py index 9ae0e65c8..07ca53838 100644 --- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py +++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py @@ -14,12 +14,12 @@ import math import os -from air.backend.xrt import XRTBackend -from air.backend.xrt_runner import XRTRunner +from air.backend.xrt import compile_air, get_air_runtime from air.compiler.util import run_transform from air.ir import * import air.passmanager from ml_dtypes import bfloat16 +import aie.utils import numpy as np @@ -213,83 +213,80 @@ input_b = np.zeros((K_FULL, N_alloc), dtype=np.float32) input_b[:, :N_actual] = (np.random.rand(K_FULL, N_actual) * 4).astype(np.float32) - if args.compile_mode == "compile-and-run": - num_samples = 100 - sampled_indices = np.vstack( + npu_kernel = compile_air( + air_module, + verbose=args.verbose, + omit_while_true_loop=False, + runtime_loop_tiling_sizes=[4, 4], + output_format="elf", + instance_name="matmul_padding_kernel", + bf16_emulation=True, + debug_ir=True, + ) + + if args.compile_mode == "compile-only": + exit(0) + + num_samples = 100 + sampled_indices = np.vstack( + [ + np.random.randint(0, M_actual, num_samples), + np.random.randint(0, N_actual, num_samples), + ] + ) + + # Add deterministic boundary-tile samples to catch padding errors. + boundary_m = list( + set( [ - np.random.randint(0, M_actual, num_samples), - np.random.randint(0, N_actual, num_samples), + min(M_actual - 1, m) + for m in [M_actual - 1, M_actual - TILE_M + 1, 0] + if m >= 0 ] ) - - # Add deterministic boundary-tile samples to catch padding errors. - boundary_m = list( - set( - [ - min(M_actual - 1, m) - for m in [M_actual - 1, M_actual - TILE_M + 1, 0] - if m >= 0 - ] - ) - ) - boundary_n = list( - set( - [ - min(N_actual - 1, n) - for n in [N_actual - 1, N_actual - TILE_N + 1, 0] - if n >= 0 - ] - ) - ) - boundary_indices = np.array([[m, n] for m in boundary_m for n in boundary_n]).T - sampled_indices = np.hstack([sampled_indices, boundary_indices]) - - # Golden: truncate f32 inputs to bf16 (matching hardware truncf_op), - # then compute dot product with f32 accumulation. - input_a_bf16 = input_a.astype(bfloat16) - input_b_bf16 = input_b.astype(bfloat16) - sampled_values = np.array( + ) + boundary_n = list( + set( [ - np.sum( - input_a_bf16[:, i].astype(np.float32) - * input_b_bf16[:, j].astype(np.float32), - dtype=np.float32, - ) - for i, j in zip(*sampled_indices) - ], - dtype=np.float32, + min(N_actual - 1, n) + for n in [N_actual - 1, N_actual - TILE_N + 1, 0] + if n >= 0 + ] ) + ) + boundary_indices = np.array([[m, n] for m in boundary_m for n in boundary_n]).T + sampled_indices = np.hstack([sampled_indices, boundary_indices]) - sampled_data = { - "shape": (M_padded, N_padded), - "indices": sampled_indices, - "values": sampled_values, - } - - runner = XRTRunner( - verbose=args.verbose, - omit_while_true_loop=False, - runtime_loop_tiling_sizes=[4, 4], - output_format="elf", - instance_name="matmul_padding_kernel", - bf16_emulation=True, - debug_ir=True, - ) - exit( - runner.run_test( - air_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], - rtol=0.1, + # Golden: truncate f32 inputs to bf16 (matching hardware truncf_op), + # then compute dot product with f32 accumulation. + input_a_bf16 = input_a.astype(bfloat16) + input_b_bf16 = input_b.astype(bfloat16) + sampled_values = np.array( + [ + np.sum( + input_a_bf16[:, i].astype(np.float32) + * input_b_bf16[:, j].astype(np.float32), + dtype=np.float32, ) + for i, j in zip(*sampled_indices) + ], + dtype=np.float32, + ) + + sampled_data = { + "shape": (M_padded, N_padded), + "indices": sampled_indices, + "values": sampled_values, + } + + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(np.zeros((M_padded, N_padded), np.float32)), + ] + exit( + runtime.run_test( + npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data], rtol=0.1 ) - elif args.compile_mode == "compile-only": - backend = XRTBackend( - verbose=args.verbose, - omit_while_true_loop=False, - output_format="elf", - bf16_emulation=True, - runtime_loop_tiling_sizes=[4, 4], - ) - module_function = backend.compile(air_module) - backend.unload() + ) diff --git a/test/xrt/55_matmul_padding_bf16_npu1/run.py b/test/xrt/55_matmul_padding_bf16_npu1/run.py index 18dffe15f..0691c684a 100644 --- a/test/xrt/55_matmul_padding_bf16_npu1/run.py +++ b/test/xrt/55_matmul_padding_bf16_npu1/run.py @@ -28,9 +28,10 @@ from air.dialects.memref import AllocOp, DeallocOp, load, store, subview from air.dialects.func import FuncOp from air.dialects.scf import for_, yield_ -from air.backend.xrt_runner import XRTRunner, type_mapper -from air.backend.xrt import XRTBackend +from air.backend.xrt import compile_air, get_air_runtime +from air.backend.xrt_runner import type_mapper from air.compiler.util import run_transform +import aie.utils from air.extras import types as extrasT from air.dialects.linalg.opdsl.lang import * import air.dialects.linalg.opdsl.lang as linalg_lang @@ -697,17 +698,25 @@ def epilogue_herd( "values": sampled_values, } - runner = XRTRunner( + npu_kernel = compile_air( + mlir_module, verbose=args.verbose, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4], instance_name="matmul_f32", ) + runtime = get_air_runtime() + io_args = [ + aie.utils.tensor(input_a), + aie.utils.tensor(input_b), + aie.utils.tensor(np.zeros((M_padded, N_padded), np.float32)), + ] exit( - runner.run_test( - mlir_module, - inputs=[input_a, input_b], - stochastic_expected_outputs=[sampled_data], + runtime.run_test( + npu_kernel, + io_args, + refs={}, + stochastic_refs=[sampled_data], rtol=0.1, max_mismatch_percentage=10, )