Xilinx · hunhoffe · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026 · Mar 27, 2026
@@ -12,11 +12,9 @@
 Uses a 1x2 AIE herd with DMA transfers between L3 and L1 memory.
 """
 
-import argparse
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
@@ -29,8 +27,7 @@
 )
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -54,16 +51,8 @@ def build_module(m, n, tile_m, np_dtype_in):
     l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_m, n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-    l1outputMemrefTy = MemRefType.get(
-        shape=[tile_m, 1],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_m, n], xrt_dtype_in)
+    l1outputMemrefTy = l1_memref_type([tile_m, 1], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy)
     def average_pool(arg0, arg2):
@@ -85,20 +74,7 @@ def herd_body(
 
             for _l_ivx in range_(0, m, tile_m * num_tiles):
 
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_m),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_m)
 
                 dma_memcpy_nd(
                     l1_a_data,
@@ -151,16 +127,16 @@ def herd_body(
                     )
                     cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                     v_a = transfer_read(
-                        VectorType.get([n], xrt_dtype_in),
+                        vec_type(n, xrt_dtype_in),
                         collapse_a,
                         [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
+                        identity_map_attr(),
                         cst0,
                         [True],
                     )
                     # Multiply by 1/N before reduction to avoid scalar bf16
                     # multiply which can produce corrupted output on AIE2.
-                    v_inv_n = broadcast(VectorType.get([n], xrt_dtype_in), inv_n)
+                    v_inv_n = broadcast(vec_type(n, xrt_dtype_in), inv_n)
                     v_scaled = arith.mulf(v_a, v_inv_n)
                     v_avg = reduction(xrt_dtype_in, CombiningKind.ADD, v_scaled)
                     store(v_avg, collapse_c, [c0])
@@ -188,20 +164,7 @@ def herd_body(
     TILE_M = 256
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the AveragePool example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the AveragePool example")
     parser.add_argument(
         "--m",
         type=int,
@@ -215,20 +178,6 @@ def herd_body(
         help="Input size (dimension N, pool width)",
     )
     parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
@@ -246,46 +195,29 @@ def herd_body(
         args.m, args.n
     )
 
-    if args.compile_mode == "compile-and-run":
-
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
-
-        # AveragePool reference: sum of (each element * 1/N) per row
-        inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n)
-        sampled_values = np.array(
-            [np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
 
-        sampled_data = {
-            "shape": (args.m,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
+    # AveragePool reference: sum of (each element * 1/N) per row
+    inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n)
+    sampled_values = np.array(
+        [np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
 
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = {
+        "shape": (args.m,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="average_pool",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
@@ -13,20 +13,23 @@
 with configurable VECTOR_SIZE (default 16).
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp, fma
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp, fma
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
+from utils import vec_read, vec_write
 
 import numpy as np
 
@@ -44,18 +47,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16):
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def axpy(arg0, arg1, arg2):
@@ -80,21 +75,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_x_data,
@@ -121,29 +102,11 @@ def herd_body(
                 v_a = BroadcastOp(vecTy, a_const)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_x = subview(
-                        l1_x_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_y = subview(
-                        l1_y_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_out = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    v_x = transfer_read(vecTy, sub_x, [c0], identity_map, cst0, [True])
-                    v_y = transfer_read(vecTy, sub_y, [c0], identity_map, cst0, [True])
+                    v_x = vec_read(l1_x_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_y = vec_read(l1_y_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     # a * x + y via vector.fma
                     v_result = fma(v_a, v_x, v_y)
-                    transfer_write(None, v_result, sub_out, [c0], identity_map, [True])
+                    vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 # Write result from l1_out back to L3 output buffer
@@ -167,12 +130,7 @@ def herd_body(
     INPUT_DATATYPE = bfloat16
     ALPHA = 2.0
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the AXPY example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the AXPY example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -184,20 +142,6 @@ def herd_body(
         default=16,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
@@ -211,41 +155,24 @@ def herd_body(
     input_x = np.random.randn(args.n).astype(INPUT_DATATYPE)
     input_y = np.random.randn(args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-        sampled_values = np.array(
-            [(args.alpha * input_x[i] + input_y[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
+    sampled_values = np.array(
+        [args.alpha * input_x[i] + input_y[i] for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_x, input_y],
             instance_name="axpy",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_x, input_y],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )