diff --git a/programming_examples/average_pool/average_pool.py b/programming_examples/average_pool/average_pool.py
index 2fd19c7e7..a55f6220b 100644
--- a/programming_examples/average_pool/average_pool.py
+++ b/programming_examples/average_pool/average_pool.py
@@ -12,11 +12,9 @@
 Uses a 1x2 AIE herd with DMA transfers between L3 and L1 memory.
 """
 
-import argparse
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
@@ -29,8 +27,7 @@
 )
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -54,16 +51,8 @@ def build_module(m, n, tile_m, np_dtype_in):
     l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_m, n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-    l1outputMemrefTy = MemRefType.get(
-        shape=[tile_m, 1],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_m, n], xrt_dtype_in)
+    l1outputMemrefTy = l1_memref_type([tile_m, 1], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy)
     def average_pool(arg0, arg2):
@@ -85,20 +74,7 @@ def herd_body(
 
             for _l_ivx in range_(0, m, tile_m * num_tiles):
 
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_m),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_m)
 
                 dma_memcpy_nd(
                     l1_a_data,
@@ -151,16 +127,16 @@ def herd_body(
                     )
                     cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                     v_a = transfer_read(
-                        VectorType.get([n], xrt_dtype_in),
+                        vec_type(n, xrt_dtype_in),
                         collapse_a,
                         [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
+                        identity_map_attr(),
                         cst0,
                         [True],
                     )
                     # Multiply by 1/N before reduction to avoid scalar bf16
                     # multiply which can produce corrupted output on AIE2.
-                    v_inv_n = broadcast(VectorType.get([n], xrt_dtype_in), inv_n)
+                    v_inv_n = broadcast(vec_type(n, xrt_dtype_in), inv_n)
                     v_scaled = arith.mulf(v_a, v_inv_n)
                     v_avg = reduction(xrt_dtype_in, CombiningKind.ADD, v_scaled)
                     store(v_avg, collapse_c, [c0])
@@ -188,20 +164,7 @@ def herd_body(
     TILE_M = 256
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the AveragePool example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the AveragePool example")
     parser.add_argument(
         "--m",
         type=int,
@@ -215,20 +178,6 @@ def herd_body(
         help="Input size (dimension N, pool width)",
     )
     parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
@@ -246,46 +195,29 @@ def herd_body(
         args.m, args.n
     )
 
-    if args.compile_mode == "compile-and-run":
-
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
-
-        # AveragePool reference: sum of (each element * 1/N) per row
-        inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n)
-        sampled_values = np.array(
-            [np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
 
-        sampled_data = {
-            "shape": (args.m,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
+    # AveragePool reference: sum of (each element * 1/N) per row
+    inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n)
+    sampled_values = np.array(
+        [np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
 
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = {
+        "shape": (args.m,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="average_pool",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/axpy/axpy.py b/programming_examples/axpy/axpy.py
index 27d925047..d5d25a5b8 100644
--- a/programming_examples/axpy/axpy.py
+++ b/programming_examples/axpy/axpy.py
@@ -13,20 +13,23 @@
 with configurable VECTOR_SIZE (default 16).
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp, fma
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp, fma
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
+from utils import vec_read, vec_write
 
 import numpy as np
 
@@ -44,18 +47,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16):
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def axpy(arg0, arg1, arg2):
@@ -80,21 +75,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_x_data,
@@ -121,29 +102,11 @@ def herd_body(
                 v_a = BroadcastOp(vecTy, a_const)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_x = subview(
-                        l1_x_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_y = subview(
-                        l1_y_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_out = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    v_x = transfer_read(vecTy, sub_x, [c0], identity_map, cst0, [True])
-                    v_y = transfer_read(vecTy, sub_y, [c0], identity_map, cst0, [True])
+                    v_x = vec_read(l1_x_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_y = vec_read(l1_y_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     # a * x + y via vector.fma
                     v_result = fma(v_a, v_x, v_y)
-                    transfer_write(None, v_result, sub_out, [c0], identity_map, [True])
+                    vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 # Write result from l1_out back to L3 output buffer
@@ -167,12 +130,7 @@ def herd_body(
     INPUT_DATATYPE = bfloat16
     ALPHA = 2.0
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the AXPY example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the AXPY example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -184,20 +142,6 @@ def herd_body(
         default=16,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
@@ -211,41 +155,24 @@ def herd_body(
     input_x = np.random.randn(args.n).astype(INPUT_DATATYPE)
     input_y = np.random.randn(args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-        sampled_values = np.array(
-            [(args.alpha * input_x[i] + input_y[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
+    sampled_values = np.array(
+        [args.alpha * input_x[i] + input_y[i] for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_x, input_y],
             instance_name="axpy",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_x, input_y],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/bottleneck/bottleneck.py b/programming_examples/bottleneck/bottleneck.py
index 5762cdfae..0a7eeba45 100644
--- a/programming_examples/bottleneck/bottleneck.py
+++ b/programming_examples/bottleneck/bottleneck.py
@@ -42,7 +42,6 @@
 enabling zero-copy data transfer between neighboring cores.
 """
 
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -53,8 +52,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -105,62 +103,38 @@ def build_module():
     l3_wts_ty = MemRefType.get((TOTAL_WEIGHTS,), i8)
     l3_act_out_ty = MemRefType.get((ACTIVATIONS_OUT,), i8)
 
-    # L2 memory space
-    l2_mem_space = IntegerAttr.get(i32, MemorySpace.L2)
-
-    # L1 memory space
-    l1_mem_space = IntegerAttr.get(i32, MemorySpace.L1)
-
     # Per-row tile types (processing one row at a time for depth-first dataflow)
     # Layer 1 input: one row of 32 pixels with 256 input channels
-    l1_layer1_in_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8, memory_space=l1_mem_space
-    )
-    l1_wts_layer1_ty = MemRefType.get((WEIGHTS_L1_SZ,), i8, memory_space=l1_mem_space)
-    l1_layer1_out_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L1_OUT_C), i8, memory_space=l1_mem_space
-    )
+    l1_layer1_in_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8)
+    l1_wts_layer1_ty = l1_memref_type((WEIGHTS_L1_SZ,), i8)
+    l1_layer1_out_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L1_OUT_C), i8)
 
     # Layer 2 (3x3 conv) types
-    l1_layer2_in_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L2_IN_C), i8, memory_space=l1_mem_space
-    )
+    l1_layer2_in_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L2_IN_C), i8)
     # L1 weights for layer 2 (36KB fits in AIE2's 64KB L1)
-    l1_wts_layer2_ty = MemRefType.get((WEIGHTS_L2_SZ,), i8, memory_space=l1_mem_space)
+    l1_wts_layer2_ty = l1_memref_type((WEIGHTS_L2_SZ,), i8)
     # Each 3x3 core produces half the output channels
-    l1_layer2_out_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L2_OUT_C // 2), i8, memory_space=l1_mem_space
-    )
+    l1_layer2_out_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L2_OUT_C // 2), i8)
     # Combined output buffer for both 3x3 conv cores (shared L1, flat 1D)
     # Core 0 writes first 1024 bytes, Core 1 writes next 1024 bytes
     CONV3X3_OUT_HALF_SIZE = TENSOR_IN_W * 1 * (TENSOR_L2_OUT_C // 2)  # 1024
-    l1_layer2_out_combined_ty = MemRefType.get(
-        (CONV3X3_OUT_HALF_SIZE * 2,), i8, memory_space=l1_mem_space
-    )
+    l1_layer2_out_combined_ty = l1_memref_type((CONV3X3_OUT_HALF_SIZE * 2,), i8)
 
     # Layer 3 (1x1 conv + skip) types
-    l1_layer3_in_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L3_IN_C // 2), i8, memory_space=l1_mem_space
-    )
-    l1_wts_layer3_ty = MemRefType.get((WEIGHTS_L3_SZ,), i8, memory_space=l1_mem_space)
-    l1_layer3_out_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8, memory_space=l1_mem_space
-    )
+    l1_layer3_in_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L3_IN_C // 2), i8)
+    l1_wts_layer3_ty = l1_memref_type((WEIGHTS_L3_SZ,), i8)
+    l1_layer3_out_ty = l1_memref_type((TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8)
 
     # L2 buffer types for skip connection
-    l2_skip_buf_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8, memory_space=l2_mem_space
-    )
+    l2_skip_buf_ty = l2_memref_type((TENSOR_IN_W, 1, TENSOR_L1_IN_C), i8)
 
     # L2 buffer type for output
-    l2_out_buf_ty = MemRefType.get(
-        (TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8, memory_space=l2_mem_space
-    )
+    l2_out_buf_ty = l2_memref_type((TENSOR_IN_W, 1, TENSOR_L3_OUT_C), i8)
 
     # L2 buffer types for weight staging
-    l2_wts_layer1_ty = MemRefType.get((WEIGHTS_L1_SZ,), i8, memory_space=l2_mem_space)
-    l2_wts_layer2_ty = MemRefType.get((WEIGHTS_L2_SZ,), i8, memory_space=l2_mem_space)
-    l2_wts_layer3_ty = MemRefType.get((WEIGHTS_L3_SZ,), i8, memory_space=l2_mem_space)
+    l2_wts_layer1_ty = l2_memref_type((WEIGHTS_L1_SZ,), i8)
+    l2_wts_layer2_ty = l2_memref_type((WEIGHTS_L2_SZ,), i8)
+    l2_wts_layer3_ty = l2_memref_type((WEIGHTS_L3_SZ,), i8)
 
     # Declare external convolution kernel functions
     # These would be linked from compiled convolution kernels
@@ -951,29 +925,7 @@ def compute_golden_reference(
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="bottleneck.py",
-        description="Builds, runs, and tests the bottleneck block example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-        help="Print MLIR IR and exit",
-    )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure whether to run after compile",
-    )
+    parser = make_air_parser("Builds, runs, and tests the bottleneck block example")
     parser.add_argument(
         "--debug-ir",
         action="store_true",
@@ -1059,52 +1011,31 @@ def compute_golden_reference(
         print(f"Expected output shape: {expected_out.shape}")
 
         print("\nRunning AIR bottleneck design...")
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            debug_ir=args.debug_ir,
-            omit_pingpong="all",  # Disable all ping-pong to avoid shared buffer sync issues,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-
-        # Custom comparison with scale factor tolerance
-        def compare_with_tolerance(actual, expected):
-            """Compare outputs with tolerance based on quantization scale."""
-            actual_scaled = actual.astype(np.float32) * inp_scale4
-            expected_scaled = expected.astype(np.float32) * inp_scale4
-
-            if np.allclose(actual_scaled, expected_scaled, rtol=0, atol=inp_scale4):
-                print("\n✓ PASS: Output matches golden reference!")
-                return True
-            else:
-                diff = np.abs(actual_scaled - expected_scaled)
-                print(f"\n✗ FAIL: Output mismatch")
-                print(f"  Max difference: {diff.max():.4f}")
-                print(f"  Mean difference: {diff.mean():.4f}")
-                print(
-                    f"  Mismatched elements: {np.sum(diff > inp_scale4)} / {len(diff)}"
-                )
-                return False
-
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_act_flat, total_wts],
+                instance_name="bottleneck_block",
                 expected_outputs=[expected_out],
                 rtol=0,
                 atol=1,  # Allow 1 unit of quantization error
+                runtime_loop_tiling_sizes=[4, 4],
+                debug_ir=args.debug_ir,
+                omit_pingpong="all",  # Disable all ping-pong to avoid shared buffer sync issues
             )
         )
 
     elif args.compile_mode == "compile-only":
         print("\nCompiling AIR bottleneck design (no execution)...")
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            debug_ir=args.debug_ir,
-            omit_pingpong="all",  # Disable all ping-pong to avoid shared buffer sync issues,
-            runtime_loop_tiling_sizes=[4, 4],
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="bottleneck_block",
+                runtime_loop_tiling_sizes=[4, 4],
+                debug_ir=args.debug_ir,
+                omit_pingpong="all",  # Disable all ping-pong to avoid shared buffer sync issues
+            )
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
-        print("Compilation successful!")
diff --git a/programming_examples/bottleneck/bottleneck_mlir.py b/programming_examples/bottleneck/bottleneck_mlir.py
index b1a2bcd83..e62a456d5 100644
--- a/programming_examples/bottleneck/bottleneck_mlir.py
+++ b/programming_examples/bottleneck/bottleneck_mlir.py
@@ -57,8 +57,8 @@
 from air.dialects import scf
 from air.dialects.scf import for_, yield_
 from air.dialects import vector as vector_dialect
-from air.backend.xrt_runner import XRTRunner
-from air.backend.xrt import XRTBackend
+import aie.utils
+from air.backend.xrt import compile_air, get_air_runtime
 
 range_ = for_
 
@@ -1542,27 +1542,31 @@ def compare_with_tolerance(actual, expected):
                 return False
 
         # Compile and run directly to get actual outputs for custom comparison
-        # (XRTRunner._check_outputs uses exact match for integers, but AIE2P
-        # SRS positive_inf rounding can differ by 1 from Python's rounding)
+        # (run_test uses atol check, but AIE2P SRS positive_inf rounding can
+        # differ by 1 from Python's rounding, so we use a custom comparison.)
         import filelock
 
-        backend = XRTBackend(
+        npu_kernel = compile_air(
+            mlir_module,
             verbose=args.verbose,
-            omit_while_true_loop=False,
+            output_format=args.output_format,
             debug_ir=args.debug_ir,
             omit_pingpong="all",
             runtime_loop_tiling_sizes=[4, 4],
+            instance_name="bottleneck_block",
         )
         output_placeholder = np.zeros(expected_out.shape, expected_out.dtype)
-        expanded_inputs = [input_act_flat, total_wts, output_placeholder]
-
-        compiled_module = backend.compile(mlir_module)
+        runtime = get_air_runtime()
+        io_args = [
+            aie.utils.tensor(input_act_flat),
+            aie.utils.tensor(total_wts),
+            aie.utils.tensor(output_placeholder),
+        ]
+        handle = runtime.load(npu_kernel)
         with filelock.FileLock("/tmp/npu.lock"):
-            module_function = backend.load(compiled_module)
-            actual_outputs = module_function(*expanded_inputs)
-        backend.unload()
+            runtime.run(handle, io_args)
 
-        actual_out = actual_outputs[len([input_act_flat, total_wts])]
+        actual_out = io_args[len([input_act_flat, total_wts])].numpy()
 
         if compare_with_tolerance(actual_out, expected_out):
             print("PASS!")
@@ -1572,13 +1576,13 @@ def compare_with_tolerance(actual, expected):
 
     elif args.compile_mode == "compile-only":
         print("\nCompiling AIR bottleneck design (no execution)...")
-        backend = XRTBackend(
+        compile_air(
+            mlir_module,
             verbose=args.verbose,
-            omit_while_true_loop=False,
+            output_format=args.output_format,
             debug_ir=args.debug_ir,
-            omit_pingpong="all",  # Disable all ping-pong to avoid shared buffer sync issues,
+            omit_pingpong="all",
             runtime_loop_tiling_sizes=[4, 4],
+            instance_name="bottleneck_block",
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
         print("Compilation successful!")
diff --git a/programming_examples/cascade_reduction/cascade_reduction.py b/programming_examples/cascade_reduction/cascade_reduction.py
index 10cb7ff92..089bad3ca 100644
--- a/programming_examples/cascade_reduction/cascade_reduction.py
+++ b/programming_examples/cascade_reduction/cascade_reduction.py
@@ -16,16 +16,13 @@
 Final result: output = input + 4
 """
 
-import argparse
-
 from air.ir import *
 from air.dialects.air import *
 from air.dialects import arith, linalg, memref, scf
 from air.dialects.memref import AllocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -45,11 +42,7 @@ def build_module():
     # L3 types
     l3MemrefTy = MemRefType.get(data_shape, xrt_dtype)
     # L1 types
-    l1MemrefTy = MemRefType.get(
-        data_shape,
-        xrt_dtype,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type(data_shape, xrt_dtype)
 
     # Channels: chan_in/chan_out use DMA (L3<->L1), chan_cascade uses
     # direct core-to-core cascade connections between adjacent tiles.
@@ -125,26 +118,7 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the cascade reduction example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
+    parser = make_air_parser("Builds, runs, and tests the cascade reduction example")
     args = parser.parse_args()
 
     mlir_module = build_module()
@@ -154,48 +128,35 @@ def herd_body(tx, ty, sx, sy):
 
     input_a = np.arange(0, DATA_SIZE, dtype=np.int32).reshape(1, 1, DATA_SIZE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.zeros(num_samples, dtype=int),
-                np.zeros(num_samples, dtype=int),
-                np.random.randint(0, DATA_SIZE, num_samples),
-            ]
-        )
-
-        sampled_values = np.array(
-            [input_a[i, j, k] + NUM_TILES for i, j, k in zip(*sampled_indices)],
-            dtype=np.int32,
-        )
-
-        sampled_data = {
-            "shape": (1, 1, DATA_SIZE),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="cascade_reduce",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.zeros(num_samples, dtype=int),
+            np.zeros(num_samples, dtype=int),
+            np.random.randint(0, DATA_SIZE, num_samples),
+        ]
+    )
+
+    sampled_values = np.array(
+        [input_a[i, j, k] + NUM_TILES for i, j, k in zip(*sampled_indices)],
+        dtype=np.int32,
+    )
+
+    sampled_data = {
+        "shape": (1, 1, DATA_SIZE),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    # Exit with run_on_npu's result so a failed check sets the process exit code.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            stochastic_expected_outputs=[sampled_data],
+            instance_name="cascade_reduce",
+            omit_while_true_loop=False,
+            runtime_loop_tiling_sizes=[4, 4],
+        )
+    )
diff --git a/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py b/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py
index c79d4d63b..e77de9daa 100644
--- a/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py
+++ b/programming_examples/channel_examples/broadcast/multi_herd/broadcast.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -26,12 +25,7 @@ def build_module():
     xrt_dtype = type_mapper(INOUT_DATATYPE)
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    image_type_l1 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype)
 
     Channel("ChanIn", size=[1, 1], broadcast_shape=[3, 1])
     for name in OUTPUT_HERD_NAMES:
@@ -87,27 +81,8 @@ def herd_body(_tx, _ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel broadcast multi herd example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
+    parser = make_air_parser(
+        "Builds, runs, and tests the channel broadcast multi herd example"
     )
 
     args = parser.parse_args()
@@ -128,16 +103,12 @@ def herd_body(_tx, _ty, _sx, _sy):
         IMAGE_SIZE
     )
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a],
+            instance_name="copy",
             expected_outputs=[output_b, output_c, output_d],
         )
     )
diff --git a/programming_examples/channel_examples/broadcast/single_herd/broadcast.py b/programming_examples/channel_examples/broadcast/single_herd/broadcast.py
index 381db1c70..c7eab91e6 100644
--- a/programming_examples/channel_examples/broadcast/single_herd/broadcast.py
+++ b/programming_examples/channel_examples/broadcast/single_herd/broadcast.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -24,12 +23,7 @@ def build_module():
     xrt_dtype = type_mapper(INOUT_DATATYPE)
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    image_type_l1 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype)
 
     Channel("ChanIn", size=[1, 1], broadcast_shape=[1, 3])
     Channel("ChanOut", size=[1, 3])
@@ -81,27 +75,8 @@ def herd_body(tx, ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel broadcast multi herd example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
+    parser = make_air_parser(
+        "Builds, runs, and tests the channel broadcast single herd example"
     )
 
     args = parser.parse_args()
@@ -122,16 +97,12 @@ def herd_body(tx, ty, _sx, _sy):
         IMAGE_SIZE
     )
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a],
+            instance_name="copy",
             expected_outputs=[output_b, output_c, output_d],
         )
     )
diff --git a/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py b/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py
index a31182df5..c926baae1 100644
--- a/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py
+++ b/programming_examples/channel_examples/broadcast_selective_capture/broadcast_selective_capture.py
@@ -18,7 +18,6 @@
 # The net effect is equivalent to a non-broadcast scatter, but implemented
 # over a single broadcast channel to conserve DMA channels.
 
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -27,7 +26,7 @@
 from air.dialects.func import FuncOp
 from air.dialects import arith, scf
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -45,12 +44,7 @@ def build_module():
     memrefTyIn = MemRefType.get([total_size], xrt_dtype)
     memrefTyOut = MemRefType.get([total_size], xrt_dtype)
 
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    tile_type_l1 = MemRefType.get(
-        shape=[TILE_SIZE],
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    tile_type_l1 = l1_memref_type([TILE_SIZE], xrt_dtype)
 
     # Broadcast channel: size [1, 1] broadcast to [1, NUM_TILES]
     # All cores in the herd receive the same data on each put.
@@ -132,27 +126,8 @@ def herd_body(tx, ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the broadcast selective capture example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
+    parser = make_air_parser(
+        "Builds, runs, and tests the broadcast selective capture example"
     )
 
     args = parser.parse_args()
@@ -175,16 +150,12 @@ def herd_body(tx, ty, _sx, _sy):
         end = start + TILE_SIZE
         expected_output[start:end] = input_a[start:end] + ty
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="broadcast_selective_capture",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a],
+            instance_name="broadcast_selective_capture",
             expected_outputs=[expected_output],
         )
     )
diff --git a/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py b/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py
index 2581f1652..d42ea2b5c 100644
--- a/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py
+++ b/programming_examples/channel_examples/channel_3d_segment_unroll/channel_3d_segment_unroll.py
@@ -23,16 +23,13 @@
   output[seg, ty] = sum_{tx=0}^{3} input[seg, tx, ty]
 """
 
-import argparse
-
 from air.ir import *
 from air.dialects.air import *
 from air.dialects import arith, linalg, scf
 from air.dialects.memref import AllocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -59,11 +56,7 @@ def build_module():
     l3MemrefTyOut = MemRefType.get([TOTAL_OUT], xrt_dtype)
 
     # L1 type: one tile per core
-    l1MemrefTy = MemRefType.get(
-        tile_shape,
-        xrt_dtype,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type(tile_shape, xrt_dtype)
 
     # 3D input channel: [NUM_SEGMENTS, NUM_TILES, NUM_COLS].
     # Each core (seg, tx, ty) gets its own unique tile.
@@ -193,18 +186,8 @@ def herd_body(tx, ty, sx, sy, herd_seg_x):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the 3D channel with segment unroll example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
+    parser = make_air_parser(
+        "Builds, runs, and tests the 3D channel with segment unroll example"
     )
     args = parser.parse_args()
 
@@ -231,17 +214,14 @@ def herd_body(tx, ty, sx, sy, herd_seg_x):
                     in_start : in_start + DATA_SIZE
                 ]
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        output_format=args.output_format,
-        instance_name="channel_3d_segment_unroll",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a],
+            instance_name="channel_3d_segment_unroll",
             expected_outputs=[expected_output],
+            # Same setting the replaced XRTRunner(...) construction passed.
+            omit_while_true_loop=False,
         )
     )
diff --git a/programming_examples/channel_examples/channel_size/channel_size.py b/programming_examples/channel_examples/channel_size/channel_size.py
index 0741f223d..e25ff299c 100644
--- a/programming_examples/channel_examples/channel_size/channel_size.py
+++ b/programming_examples/channel_examples/channel_size/channel_size.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 np.random.seed(42)
@@ -10,7 +9,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -86,15 +85,8 @@ def segment_body():
                 )
                 def herd_body(th, tw, _sx, _sy):
 
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                     # This is the type definition of the tile
-                    tile_type = MemRefType.get(
-                        shape=TILE_SIZE,
-                        element_type=xrt_dtype,
-                        memory_space=mem_space,
-                    )
+                    tile_type = l1_memref_type(TILE_SIZE, xrt_dtype)
 
                     # We must allocate a buffer of tile size for the input/output
                     tile_in = AllocOp(tile_type, [], [])
@@ -123,28 +115,7 @@ def herd_body(th, tw, _sx, _sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel_size example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
+    parser = make_air_parser("Builds, runs, and tests the channel_size example")
 
     args = parser.parse_args()
 
@@ -161,14 +132,12 @@ def herd_body(th, tw, _sx, _sy):
     )
     output_matrix = input_matrix.copy()
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
-            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_matrix],
+            instance_name="copy",
+            expected_outputs=[output_matrix],
         )
     )
diff --git a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
index bb46f20a5..ef5b21bbb 100644
--- a/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/multi_segment/herd_to_herd.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -24,15 +23,8 @@ def build_module():
     xrt_dtype = type_mapper(INOUT_DATATYPE)
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
-    # We want to store our data in L1 memory
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
     # This is the type definition of the tile
-    image_type_l1 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype)
 
     # Create two channels which will send/receive the
     # input/output data respectively
@@ -120,28 +112,7 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the herd_to_herd channel example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
+    parser = make_air_parser("Builds, runs, and tests the herd_to_herd channel example")
 
     args = parser.parse_args()
 
@@ -153,10 +124,12 @@ def herd_body(tx, ty, sx, sy):
     input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE)
     output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="copy",
+            expected_outputs=[output_b],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
index 65c77f0ac..0ea651a79 100644
--- a/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
+++ b/programming_examples/channel_examples/herd_to_herd/single_segment/herd_to_herd.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -9,7 +8,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -41,15 +40,8 @@ def build_module():
     xrt_dtype = type_mapper(INOUT_DATATYPE)
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
-    # We want to store our data in L1 memory
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
     # This is the type definition of the tile
-    image_type_l1 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype)
 
     # Create two channels which will send/receive the
     # input/output data respectively
@@ -128,28 +120,7 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the herd_to_herd channel example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
+    parser = make_air_parser("Builds, runs, and tests the herd_to_herd channel example")
 
     args = parser.parse_args()
 
@@ -161,10 +132,12 @@ def herd_body(tx, ty, sx, sy):
     input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE)
     output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="copy",
+            expected_outputs=[output_b],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/channel_examples/hierarchical/hierarchical.py b/programming_examples/channel_examples/hierarchical/hierarchical.py
index 23752159b..380b975dd 100644
--- a/programming_examples/channel_examples/hierarchical/hierarchical.py
+++ b/programming_examples/channel_examples/hierarchical/hierarchical.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -24,19 +23,8 @@ def build_module():
     xrt_dtype = type_mapper(INOUT_DATATYPE)
     memrefTyInOut = MemRefType.get(IMAGE_SIZE, xrt_dtype)
 
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
-
-    image_type_l1 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
-    image_type_l2 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l2,
-    )
+    image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype)
+    image_type_l2 = l2_memref_type(IMAGE_SIZE, xrt_dtype)
 
     Channel("ChanInL2")
     Channel("ChanOutL2")
@@ -96,28 +84,7 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel hierarchical example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
+    parser = make_air_parser("Builds, runs, and tests the channel hierarchical example")
 
     args = parser.parse_args()
 
@@ -133,14 +100,12 @@ def herd_body(tx, ty, sx, sy):
         IMAGE_SIZE
     )
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
-            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_matrix],
+            instance_name="copy",
+            expected_outputs=[output_matrix],
         )
     )
diff --git a/programming_examples/channel_examples/worker_to_self/worker_to_self.py b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
index 196cf6a88..847a57f17 100644
--- a/programming_examples/channel_examples/worker_to_self/worker_to_self.py
+++ b/programming_examples/channel_examples/worker_to_self/worker_to_self.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -29,19 +28,8 @@ def build_module():
     Channel("ChanOut")
     Channel("ToSelf")
 
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    image_type_l1 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
-
-    mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
-    image_type_l2 = MemRefType.get(
-        shape=IMAGE_SIZE,
-        element_type=xrt_dtype,
-        memory_space=mem_space_l2,
-    )
+    image_type_l1 = l1_memref_type(IMAGE_SIZE, xrt_dtype)
+    image_type_l2 = l2_memref_type(IMAGE_SIZE, xrt_dtype)
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
@@ -97,27 +85,8 @@ def herd_body(tx, ty, sx, sy, tensor_in_l2):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel worker_to_self example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
+    parser = make_air_parser(
+        "Builds, runs, and tests the channel worker_to_self example"
     )
 
     args = parser.parse_args()
@@ -132,14 +101,12 @@ def herd_body(tx, ty, sx, sy, tensor_in_l2):
     )
     output_matrix = input_matrix.copy()
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
-            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_matrix],
+            instance_name="copy",
+            expected_outputs=[output_matrix],
         )
     )
diff --git a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
index e49131fc7..6a756b310 100644
--- a/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
+++ b/programming_examples/channel_examples/worker_to_worker/worker_to_worker.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -9,7 +8,7 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.affine import apply as affine_apply
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -138,15 +137,8 @@ def herd_body(th, tw, sh, sw):
                     )
                     th_next = affine_apply(get_tile_height_next, [tw, sw, th, sh])
 
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                     # This is the type definition of the tile
-                    tile_type = MemRefType.get(
-                        shape=TILE_SIZE,
-                        element_type=xrt_dtype,
-                        memory_space=mem_space,
-                    )
+                    tile_type = l1_memref_type(TILE_SIZE, xrt_dtype)
 
                     # We must allocate a buffer of tile size for the input/output
                     tile_in = AllocOp(tile_type, [], [])
@@ -195,27 +187,8 @@ def herd_body(th, tw, sh, sw):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the channel worker_to_worker example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
+    parser = make_air_parser(
+        "Builds, runs, and tests the channel worker_to_worker example"
     )
 
     args = parser.parse_args()
@@ -257,14 +230,12 @@ def get_next_tile_num(tile_height, tile_width):
                 input_matrix[i, j] + tile_num_map[(i // TILE_HEIGHT, j // TILE_WIDTH)]
             )
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
-            mlir_module, inputs=[input_matrix], expected_outputs=[output_matrix]
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_matrix],
+            instance_name="copy",
+            expected_outputs=[output_matrix],
         )
     )
diff --git a/programming_examples/conditional_branching/single_core.py b/programming_examples/conditional_branching/single_core.py
index 5645c2986..1aa79eabf 100644
--- a/programming_examples/conditional_branching/single_core.py
+++ b/programming_examples/conditional_branching/single_core.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 from math import cos, sin
 
 from air.ir import *
@@ -10,8 +9,7 @@
 from air.dialects.func import FuncOp, CallOp
 from air.dialects import scf
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -47,17 +45,8 @@ def segment_body(
                 _l3_out_data,
             ):
                 # L2 MemRefTypes
-                l2_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L2)
-                l2MemrefTyIn = MemRefType.get(
-                    shape=[n],
-                    element_type=xrt_dtype_in,
-                    memory_space=l2_mem_space,
-                )
-                l2MemrefTyOut = MemRefType.get(
-                    shape=[n],
-                    element_type=xrt_dtype_in,
-                    memory_space=l2_mem_space,
-                )
+                l2MemrefTyIn = l2_memref_type([n], xrt_dtype_in)
+                l2MemrefTyOut = l2_memref_type([n], xrt_dtype_in)
                 l2_in_data = AllocOp(l2MemrefTyIn, [], [])
                 l2_out_data = AllocOp(l2MemrefTyOut, [], [])
                 dma_memcpy_nd(
@@ -76,17 +65,8 @@ def herd_body_0(
                     _tx, _ty, _sx, _sy, _l2_in_data, _l2_out_data, _param_arg
                 ):
 
-                    l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-                    l1MemrefTyIn = MemRefType.get(
-                        shape=[n],
-                        element_type=xrt_dtype_in,
-                        memory_space=l1_mem_space,
-                    )
-                    l1MemrefTyOut = MemRefType.get(
-                        shape=[n],
-                        element_type=xrt_dtype_in,
-                        memory_space=l1_mem_space,
-                    )
+                    l1MemrefTyIn = l1_memref_type([n], xrt_dtype_in)
+                    l1MemrefTyOut = l1_memref_type([n], xrt_dtype_in)
 
                     l1_in_data = AllocOp(l1MemrefTyIn, [], [])
                     dma_memcpy_nd(
@@ -141,34 +121,13 @@ def herd_body_0(
     INPUT_DATATYPE = np.int32
     OUTPUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the conditional branch example")
     parser.add_argument(
         "--n",
         type=int,
         default=N,
         help="N dimension size in a (1xK) * (KxN) matmul",
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -188,16 +147,13 @@ def herd_body_0(
     inputs = np.arange(0, args.n, dtype=INPUT_DATATYPE).reshape(args.n)
     outputs = inputs * 100
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="conditional_branch",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
-    res0 = runner.run_test(
+    res0 = run_on_npu(
+        args,
         mlir_module,
         inputs=[inputs],
+        instance_name="conditional_branch",
         expected_outputs=[outputs],
+        runtime_loop_tiling_sizes=[4, 4],
     )
 
     ###### Compile and test, param = 1
@@ -211,10 +167,13 @@ def herd_body_0(
     )
 
     outputs = inputs + 100
-    res1 = runner.run_test(
+    res1 = run_on_npu(
+        args,
         mlir_module,
         inputs=[inputs],
+        instance_name="conditional_branch",
         expected_outputs=[outputs],
+        runtime_loop_tiling_sizes=[4, 4],
     )
     if res0 == 0 and res1 == 0:
         print("Both conditions PASS!")
diff --git a/programming_examples/conv2d/conv2d.py b/programming_examples/conv2d/conv2d.py
index af591c6a3..cdbab2b47 100644
--- a/programming_examples/conv2d/conv2d.py
+++ b/programming_examples/conv2d/conv2d.py
@@ -18,16 +18,14 @@
   3. DMA output tile from L1 to L3
 """
 
-import argparse
-
+import numpy as np
 from air.ir import *
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -56,10 +54,9 @@ def build_module(H, W, Ci, Co, Kh, Kw, np_dtype):
 
     # L1 types: drop the batch dimension (N=1) since we process one
     # sample. The DMA copies the full extent which matches because N=1.
-    l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    l1InTy = MemRefType.get([H, W, Ci], xrt_dtype, memory_space=l1_mem_space)
-    l1FilterTy = MemRefType.get([Kh, Kw, Ci, Co], xrt_dtype, memory_space=l1_mem_space)
-    l1OutTy = MemRefType.get([Ho, Wo, Co], xrt_dtype, memory_space=l1_mem_space)
+    l1InTy = l1_memref_type([H, W, Ci], xrt_dtype)
+    l1FilterTy = l1_memref_type([Kh, Kw, Ci, Co], xrt_dtype)
+    l1OutTy = l1_memref_type([Ho, Wo, Co], xrt_dtype)
 
     @FuncOp.from_py_func(l3InTy, l3FilterTy, l3OutTy)
     def conv2d(arg_in, arg_filter, arg_out):
@@ -122,30 +119,11 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_filter, l3_out):
 if __name__ == "__main__":
     INPUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the 2D convolution example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the 2D convolution example")
     parser.add_argument("--H", type=int, default=H_DEFAULT, help="Input height")
     parser.add_argument("--W", type=int, default=W_DEFAULT, help="Input width")
     parser.add_argument("--Ci", type=int, default=CI_DEFAULT, help="Input channels")
     parser.add_argument("--Co", type=int, default=CO_DEFAULT, help="Output channels")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     Ho = args.H - KH + 1
@@ -177,28 +155,12 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_filter, l3_out):
                                 * filter_data[kh, kw, ci, co]
                             )
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="conv2d",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_data, filter_data],
-                expected_outputs=[output_ref],
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    run_on_npu(
+        args,
+        mlir_module,
+        inputs=[input_data, filter_data],
+        expected_outputs=[output_ref],
+        instance_name="conv2d",
+        omit_while_true_loop=False,
+        runtime_loop_tiling_sizes=[4, 4],
+    )
diff --git a/programming_examples/data_transfer_transpose/channel/transpose.py b/programming_examples/data_transfer_transpose/channel/transpose.py
index d27e74b88..1d3b2ae12 100644
--- a/programming_examples/data_transfer_transpose/channel/transpose.py
+++ b/programming_examples/data_transfer_transpose/channel/transpose.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 np.random.seed(42)
@@ -9,7 +8,7 @@
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 dtype_map = {
     "uint32": np.uint32,
@@ -43,15 +42,10 @@ def launch_body(a, b):
             def segment_body():
                 @herd(name="herd", sizes=[1, 1])
                 def herd_body(_tx, _ty, _sx, _sy):
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                     # This is the type definition of the tensor
-                    tensor_type = MemRefType.get(
-                        shape=[k * m],  # Read as one large array
-                        element_type=xrt_dtype,
-                        memory_space=mem_space,
-                    )
+                    tensor_type = l1_memref_type(
+                        [k * m], xrt_dtype
+                    )  # Read as one large array
 
                     # We must allocate a buffer of tile size for the input/output
                     tensor_in = AllocOp(tensor_type, [], [])
@@ -63,15 +57,8 @@ def herd_body(_tx, _ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the matrix_scalar_add/single_core_channel example"
     )
     parser.add_argument(
         "-m",
@@ -92,19 +79,6 @@ def herd_body(_tx, _ty, _sx, _sy):
         choices=dtype_map.keys(),
         help="The data type of the matrix",
     )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -136,16 +110,12 @@ def herd_body(_tx, _ty, _sx, _sy):
         )
     expected_output_matrix = np.transpose(input_matrix)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="transpose",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_matrix],
+            instance_name="transpose",
             expected_outputs=[expected_output_matrix],
         )
     )
diff --git a/programming_examples/data_transfer_transpose/dma/transpose.py b/programming_examples/data_transfer_transpose/dma/transpose.py
index 094e4a0c2..b5c7a9ffd 100644
--- a/programming_examples/data_transfer_transpose/dma/transpose.py
+++ b/programming_examples/data_transfer_transpose/dma/transpose.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 np.random.seed(42)
@@ -9,7 +8,7 @@
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 dtype_map = {
     "uint32": np.uint32,
@@ -34,15 +33,10 @@ def launch_body(a, b):
             def segment_body(arg2, arg3):
                 @herd(name="herd", sizes=[1, 1], operands=[arg2, arg3])
                 def herd_body(_tx, _ty, _sx, _sy, a, b):
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                     # This is the type definition of the tensor
-                    tensor_type = MemRefType.get(
-                        shape=[m * k],  # Read as one large array
-                        element_type=xrt_dtype,
-                        memory_space=mem_space,
-                    )
+                    tensor_type = l1_memref_type(
+                        [m * k], xrt_dtype
+                    )  # Read as one large array
 
                     # We must allocate a buffer of tile size for the input/output
                     tensor_in = AllocOp(tensor_type, [], [])
@@ -69,15 +63,8 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the matrix_scalar_add/single_core_channel example",
-    )
-
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the matrix_scalar_add/single_core_channel example"
     )
     parser.add_argument(
         "-m",
@@ -98,19 +85,6 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
         choices=dtype_map.keys(),
         help="The data type of the matrix",
     )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -142,16 +116,12 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
         )
     expected_output_matrix = np.transpose(input_matrix)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="transpose",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_matrix],
+            instance_name="transpose",
             expected_outputs=[expected_output_matrix],
         )
     )
diff --git a/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py b/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py
index cc4ef215c..b8ea421b1 100644
--- a/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py
+++ b/programming_examples/data_transfer_transpose/dma_bf16/transpose_bf16.py
@@ -12,7 +12,6 @@
 perform the transpose.
 """
 
-import argparse
 import numpy as np
 from ml_dtypes import bfloat16
 
@@ -22,8 +21,7 @@
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 INOUT_DATATYPE = bfloat16
 
@@ -35,12 +33,7 @@ def build_module(m, k):
     memrefTyIn = MemRefType.get(shape=[m * k], element_type=xrt_dtype)
     memrefTyOut = MemRefType.get(shape=[k * m], element_type=xrt_dtype)
 
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    l1_type = MemRefType.get(
-        shape=[m * k],
-        element_type=xrt_dtype,
-        memory_space=mem_space,
-    )
+    l1_type = l1_memref_type([m * k], xrt_dtype)
 
     transpose_func = external_func("transpose_bf16", inputs=[l1_type, l1_type])
 
@@ -78,28 +71,9 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
     M = 64
     K = 32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the bf16 transpose example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the bf16 transpose example")
     parser.add_argument("-m", type=int, default=M, help="Matrix rows")
     parser.add_argument("-k", type=int, default=K, help="Matrix columns")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.m, args.k)
@@ -110,25 +84,12 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
     input_matrix = np.random.uniform(-1.0, 1.0, (args.m, args.k)).astype(INOUT_DATATYPE)
     expected_output = np.transpose(input_matrix)
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_matrix.reshape(-1)],
             instance_name="transpose",
-            runtime_loop_tiling_sizes=[4, 4],
+            expected_outputs=[expected_output.reshape(-1)],
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_matrix.reshape(-1)],
-                expected_outputs=[expected_output.reshape(-1)],
-            )
-        )
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/dequant_awq/dequant_awq.py b/programming_examples/dequant_awq/dequant_awq.py
index 6e0c2a23a..fa215aaf6 100644
--- a/programming_examples/dequant_awq/dequant_awq.py
+++ b/programming_examples/dequant_awq/dequant_awq.py
@@ -21,8 +21,7 @@
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, run_on_npu
 
 
 @module_builder
@@ -149,29 +148,16 @@ def herd_body(_tx, _ty, _sx, _sy, hw, hp, ho):
 
     packed_i8 = packed_weights.view(np.int8)
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_pingpong=True,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[packed_i8, params],
             instance_name="dequant",
+            expected_outputs=[ref_output],
+            rtol=1e-1,
+            atol=5e-2,
             runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[packed_i8, params],
-                expected_outputs=[ref_output],
-                rtol=1e-1,
-                atol=5e-2,
-            )
-        )
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
             omit_pingpong=True,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/eltwise_add/eltwise_add.py b/programming_examples/eltwise_add/eltwise_add.py
index 75b7398c4..f354e29de 100644
--- a/programming_examples/eltwise_add/eltwise_add.py
+++ b/programming_examples/eltwise_add/eltwise_add.py
@@ -11,7 +11,10 @@
 configurable VECTOR_SIZE (default 16 for BF16, 8 for F32).
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from ml_dtypes import bfloat16
 
@@ -24,8 +27,7 @@
 from air.dialects.vector import transfer_read, transfer_write
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -51,15 +53,8 @@ def build_module(
         n % (tile_n * total_tiles) == 0
     ), f"n ({n}) must be divisible by tile_n*total_tiles ({tile_n}*{total_tiles}={tile_n*total_tiles})"
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
 
     # Vectorization setup
     vectorize = vector_size > 0
@@ -67,8 +62,8 @@ def build_module(
         assert (
             tile_n % vector_size == 0
         ), f"tile_n ({tile_n}) must be divisible by vector_size ({vector_size})"
-        vecTy = VectorType.get([vector_size], xrt_dtype_in)
-        identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+        vecTy = vec_type(vector_size, xrt_dtype_in)
+        imap = identity_map_attr()
         index_type = IndexType.get()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
@@ -147,14 +142,10 @@ def herd_body(
                         sub_a = subview(l1_a_data.result, [j], [vector_size], [1])
                         sub_b = subview(l1_b_data.result, [j], [vector_size], [1])
                         sub_c = subview(l1_out_data.result, [j], [vector_size], [1])
-                        v_a = transfer_read(
-                            vecTy, sub_a, [c0], identity_map, cst0, [True]
-                        )
-                        v_b = transfer_read(
-                            vecTy, sub_b, [c0], identity_map, cst0, [True]
-                        )
+                        v_a = transfer_read(vecTy, sub_a, [c0], imap, cst0, [True])
+                        v_b = transfer_read(vecTy, sub_b, [c0], imap, cst0, [True])
                         v_c = arith.AddFOp(v_a, v_b)
-                        transfer_write(None, v_c, sub_c, [c0], identity_map, [True])
+                        transfer_write(None, v_c, sub_c, [c0], imap, [True])
                         yield_([])
                 else:
                     # Scalar compute loop (original)
@@ -190,20 +181,7 @@ def herd_body(
     VECTOR_SIZE = 16
     NUM_TILES = 2
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the eltwise_add example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the eltwise_add example")
     parser.add_argument(
         "--n",
         type=int,
@@ -242,22 +220,6 @@ def herd_body(
         default="bf16",
         help="Data type (default: bf16)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
     args = parser.parse_args()
 
     if args.dtype == "bf16":
@@ -281,57 +243,36 @@ def herd_body(
     input_a = np.random.uniform(0, 4, args.n).astype(INPUT_DATATYPE)
     input_b = np.random.uniform(0, 4, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
+    # Stochastically sample num_samples results and pass them to run_on_npu for verification.
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),  # i indices
+        ]
+    )
 
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] + input_b[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [input_a[i] + input_b[i] for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
 
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    # Store as a dictionary
+    sampled_data = {
+        "shape": (args.n),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    # BF16 has ~0.8% relative precision; use looser tolerance
+    rtol = 0.01 if INPUT_DATATYPE == bfloat16 else 1e-3
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b],
             instance_name="eltwise_add",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        # BF16 has ~0.8% relative precision; use looser tolerance
-        rtol = 0.01 if INPUT_DATATYPE == bfloat16 else 1e-3
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=rtol,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            omit_auto_broadcast=True,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=rtol,
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/eltwise_add_with_l2/eltwise_add.py b/programming_examples/eltwise_add_with_l2/eltwise_add.py
index 8b87ae7da..9fc393cf6 100644
--- a/programming_examples/eltwise_add_with_l2/eltwise_add.py
+++ b/programming_examples/eltwise_add_with_l2/eltwise_add.py
@@ -1,17 +1,14 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -33,18 +30,10 @@ def build_module(n, tile_n, np_dtype_in):
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
 
     # L2 MemRefTypes
-    l2MemrefTy = MemRefType.get(
-        shape=a_size,
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L2),
-    )
+    l2MemrefTy = l2_memref_type(a_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def eltwise_add(arg0, arg1, arg2):
@@ -102,20 +91,7 @@ def herd_body(
 
                     for _l_ivx in range_(0, n, tile_n * num_tiles):
 
-                        offset_map = AffineMap.get(
-                            0,
-                            2,
-                            [
-                                AffineExpr.get_add(
-                                    AffineSymbolExpr.get(0),
-                                    AffineExpr.get_mul(
-                                        AffineSymbolExpr.get(1),
-                                        AffineConstantExpr.get(tile_n),
-                                    ),
-                                )
-                            ],
-                        )
-                        offset = affine_apply(offset_map, [_l_ivx, _ty])
+                        offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                         dma_memcpy_nd(
                             l1_a_data,
@@ -171,20 +147,7 @@ def herd_body(
     TILE_N = 1024
     INPUT_DATATYPE = np.float32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--n",
         type=int,
@@ -192,22 +155,6 @@ def herd_body(
         help="Total number of elements",
     )
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -225,55 +172,34 @@ def herd_body(
     input_b = np.arange(0, args.n, dtype=np.int64).reshape(args.n)
     input_b = input_b.astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] + input_b[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="eltwise_add",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-3,
-            )
-        )
+    # Stochastically sample num_samples results and pass them to run_on_npu for verification.
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),  # i indices
+        ]
+    )
 
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            omit_auto_broadcast=True,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [input_a[i] + input_b[i] for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
 
-        backend.unload()
+    # Store as a dictionary
+    sampled_data = {
+        "shape": (args.n),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    run_on_npu(
+        args,
+        mlir_module,
+        inputs=[input_a, input_b],
+        stochastic_expected_outputs=[sampled_data],
+        instance_name="eltwise_add",
+        omit_while_true_loop=False,
+        runtime_loop_tiling_sizes=[4, 4],
+        rtol=1e-3,
+    )
diff --git a/programming_examples/ffn_swiglu/decode/ffn_decode.py b/programming_examples/ffn_swiglu/decode/ffn_decode.py
index c96eb340c..615c179b7 100644
--- a/programming_examples/ffn_swiglu/decode/ffn_decode.py
+++ b/programming_examples/ffn_swiglu/decode/ffn_decode.py
@@ -30,8 +30,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -266,6 +265,13 @@ def h(_tx, _ty, _sx, _sy, hi, hw, ho):
         default="elf",
         dest="output_format",
     )
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        choices=["compile-only", "compile-and-run"],
+        dest="compile_mode",
+        default="compile-and-run",
+    )
     args = parser.parse_args()
 
     dim = args.dim
@@ -312,20 +318,16 @@ def pack_weights_partitioned(W, dim, dim_m, num_cols):
     intermediate = silu_gate * up
     ref_out = (W_down.astype(np.float32) @ intermediate).astype(INPUT_DATATYPE)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        omit_pingpong=True,
-        output_format=args.output_format,
-        instance_name="ffn_swiglu",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[x, packed_weights, gate_buf, up_buf, inter_buf],
+            instance_name="ffn_swiglu",
             expected_outputs=[ref_out],
             rtol=1e0,
             atol=0.5,
+            runtime_loop_tiling_sizes=[4, 4],
+            omit_pingpong=True,
         )
     )
diff --git a/programming_examples/ffn_swiglu/prefill/ffn_prefill.py b/programming_examples/ffn_swiglu/prefill/ffn_prefill.py
index 5309485b7..393c5bade 100644
--- a/programming_examples/ffn_swiglu/prefill/ffn_prefill.py
+++ b/programming_examples/ffn_swiglu/prefill/ffn_prefill.py
@@ -30,8 +30,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -268,6 +267,13 @@ def h(_tx, _ty, _sx, _sy, hi, hw, ho):
         default="elf",
         dest="output_format",
     )
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        choices=["compile-only", "compile-and-run"],
+        dest="compile_mode",
+        default="compile-and-run",
+    )
     args = parser.parse_args()
 
     seq_len = args.seq_len
@@ -321,16 +327,9 @@ def pack_weights(W, dim, dim_n, num_cols):
     intermediate = silu_gate * up
     ref_out = (intermediate @ W_down.astype(np.float32).T).astype(INPUT_DATATYPE)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        omit_pingpong=True,
-        runtime_loop_tiling_sizes=[4, 4],
-        output_format=args.output_format,
-        instance_name="ffn_swiglu",
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[
                 x.reshape(-1),
@@ -339,8 +338,11 @@ def pack_weights(W, dim, dim_n, num_cols):
                 up_buf,
                 inter_buf,
             ],
+            instance_name="ffn_swiglu",
             expected_outputs=[ref_out.reshape(-1)],
             rtol=1e0,
             atol=0.5,
+            runtime_loop_tiling_sizes=[4, 4],
+            omit_pingpong=True,
         )
     )
diff --git a/programming_examples/flash_attention/dataflow_based/attn.py b/programming_examples/flash_attention/dataflow_based/attn.py
index d31f338c5..b69c27ebe 100644
--- a/programming_examples/flash_attention/dataflow_based/attn.py
+++ b/programming_examples/flash_attention/dataflow_based/attn.py
@@ -12,8 +12,7 @@
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
 from air.dialects import scf, affine, arith
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from ml_dtypes import bfloat16
 
 range_ = for_
@@ -733,6 +732,13 @@ def herd_body_final(arg22, arg23, arg24, arg25, arg27, arg28, arg29):
         default="aie2",
         help="Target architecture (default: aie2)",
     )
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        choices=["compile-only", "compile-and-run"],
+        dest="compile_mode",
+        default="compile-and-run",
+    )
 
     args = parser.parse_args()
 
@@ -753,9 +759,6 @@ def herd_body_final(arg22, arg23, arg24, arg25, arg27, arg28, arg29):
         print(mlir_module)
         exit(0)
 
-    # Import XRT dependencies only when running tests
-    from air.backend.xrt_runner import XRTRunner, type_mapper
-    from air.backend.xrt import XRTBackend
     from air.extras import types as extrasT
     from ml_dtypes import bfloat16
 
@@ -800,19 +803,15 @@ def herd_body_final(arg22, arg23, arg24, arg25, arg27, arg28, arg29):
 
     lazy_attn_output = (Gp / sp).astype(OUTPUT_DATATYPE)
 
-    runner = XRTRunner(
-        omit_while_true_loop=False,
-        omit_pingpong=True,
-        verbose=False,
-        runtime_loop_tiling_sizes=[1, 1],
-        output_format=args.output_format,
-        instance_name="attention_bf16",
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_q_scaled, input_k, input_v, input_m],
+            instance_name="attention_bf16",
             expected_outputs=[lazy_attn_output],
             rtol=1e-1,
+            runtime_loop_tiling_sizes=[1, 1],
+            omit_pingpong=True,
         )
     )
diff --git a/programming_examples/flash_attention/kernel_fusion_based/attn.py b/programming_examples/flash_attention/kernel_fusion_based/attn.py
index 2b9882a9d..0552cf5a8 100644
--- a/programming_examples/flash_attention/kernel_fusion_based/attn.py
+++ b/programming_examples/flash_attention/kernel_fusion_based/attn.py
@@ -50,6 +50,7 @@
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_ as scf_range, yield_
 from air.dialects import scf, affine, arith
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 
 @module_builder
@@ -1259,8 +1260,6 @@ def _emit_counter_increment():
         print(mlir_module)
         exit(0)
 
-    from air.backend.xrt_runner import XRTRunner
-    from air.backend.xrt import XRTBackend
     from ml_dtypes import bfloat16
 
     INPUT_DATATYPE = OUTPUT_DATATYPE = bfloat16
@@ -1305,37 +1304,19 @@ def _emit_counter_increment():
     )
 
     tiling = [1, 1, 1] if dv_chunks_host > 1 else [1, 1]
-    runner = XRTRunner(
-        omit_while_true_loop=False,
-        omit_pingpong="all",
-        verbose=args.verbose,
-        runtime_loop_tiling_sizes=tiling,
-        output_format=args.output_format,
-        instance_name="attention_bf16",
-        target_device="npu2",
-    )
-
-    if args.compile_mode == "compile-and-run":
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_q, input_k, input_v],
-                expected_outputs=[sdpa_output_transposed],
-                atol=0.15,
-                rtol=0.04,
-                max_mismatch_percentage=0.5,
-                min_correlation=0.99,
-            )
-        )
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            omit_while_true_loop=False,
-            omit_pingpong="all",
-            verbose=args.verbose,
-            runtime_loop_tiling_sizes=tiling,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_q, input_k, input_v],
             instance_name="attention_bf16",
+            expected_outputs=[sdpa_output_transposed],
+            atol=0.15,
+            rtol=0.04,
+            max_mismatch_percentage=0.5,
+            min_correlation=0.99,
+            runtime_loop_tiling_sizes=tiling,
+            omit_pingpong="all",
             target_device="npu2",
         )
-        module_function = backend.compile(mlir_module)
-        print("Compilation complete.")
+    )
diff --git a/programming_examples/gelu/gelu.py b/programming_examples/gelu/gelu.py
index fe33c239b..2c0bfb80b 100644
--- a/programming_examples/gelu/gelu.py
+++ b/programming_examples/gelu/gelu.py
@@ -15,21 +15,24 @@
 Computation is vectorized using vector.transfer_read/write.
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith, math as math_dialect
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
+from utils import vec_read, vec_write
 
 range_ = for_
 
@@ -47,14 +50,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16):
     index_type = IndexType.get()
 
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def gelu(arg0, arg1):
@@ -65,20 +63,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
             l1_out = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_in,
@@ -102,10 +87,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
                 v_s2opi = BroadcastOp(vecTy, s2opi_const)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1])
-                    sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1])
-
-                    v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True])
+                    v_x = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
 
                     # GELU(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
                     # Uses hardware tanh intrinsic — no exp or division needed.
@@ -119,7 +101,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
                     v_half_x = arith.mulf(v_x, v_half.result)
                     v_gelu = arith.mulf(v_half_x, v_one_plus_tanh)
 
-                    transfer_write(None, v_gelu, sub_out, [c0], identity_map, [True])
+                    vec_write(v_gelu, l1_out, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -139,31 +121,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     TILE_N = 1024
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the GELU example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the GELU example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
         "--vector-size", type=int, default=16, help="Vector size for SIMD operations"
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size)
@@ -174,62 +137,42 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     np.random.seed(0)
     input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-
-        # Match hardware bf16 computation: each op truncates to bf16
-        def gelu_ref(x):
-            x_bf = INPUT_DATATYPE(x)
-            x2 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x_bf))
-            x3 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x2))
-            beta_x3 = INPUT_DATATYPE(
-                np.float32(x3) * np.float32(INPUT_DATATYPE(GELU_BETA))
-            )
-            inner = INPUT_DATATYPE(np.float32(x_bf) + np.float32(beta_x3))
-            scaled = INPUT_DATATYPE(
-                np.float32(inner) * np.float32(INPUT_DATATYPE(SQRT_2_OVER_PI))
-            )
-            tanh_val = INPUT_DATATYPE(np.tanh(np.float32(scaled)))
-            one_plus_tanh = INPUT_DATATYPE(
-                np.float32(tanh_val) + np.float32(INPUT_DATATYPE(1.0))
-            )
-            half_x = INPUT_DATATYPE(np.float32(x_bf) * np.float32(INPUT_DATATYPE(0.5)))
-            return INPUT_DATATYPE(np.float32(half_x) * np.float32(one_plus_tanh))
-
-        sampled_values = np.array(
-            [gelu_ref(input_a[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="gelu",
-            runtime_loop_tiling_sizes=[4, 4],
+    # Match hardware bf16 computation: each op truncates to bf16
+    def gelu_ref(x):
+        x_bf = INPUT_DATATYPE(x)
+        x2 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x_bf))
+        x3 = INPUT_DATATYPE(np.float32(x_bf) * np.float32(x2))
+        beta_x3 = INPUT_DATATYPE(np.float32(x3) * np.float32(INPUT_DATATYPE(GELU_BETA)))
+        inner = INPUT_DATATYPE(np.float32(x_bf) + np.float32(beta_x3))
+        scaled = INPUT_DATATYPE(
+            np.float32(inner) * np.float32(INPUT_DATATYPE(SQRT_2_OVER_PI))
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-                atol=5e-2,
-            )
+        tanh_val = INPUT_DATATYPE(np.tanh(np.float32(scaled)))
+        one_plus_tanh = INPUT_DATATYPE(
+            np.float32(tanh_val) + np.float32(INPUT_DATATYPE(1.0))
         )
+        half_x = INPUT_DATATYPE(np.float32(x_bf) * np.float32(INPUT_DATATYPE(0.5)))
+        return INPUT_DATATYPE(np.float32(half_x) * np.float32(one_plus_tanh))
 
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
+    sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
+    sampled_values = np.array(
+        [gelu_ref(input_a[i]) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="gelu",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
+            atol=5e-2,
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/herd_dataflow/run.py b/programming_examples/herd_dataflow/run.py
index 177813e35..5dbaeb504 100644
--- a/programming_examples/herd_dataflow/run.py
+++ b/programming_examples/herd_dataflow/run.py
@@ -20,7 +20,7 @@
 from air.dialects.scf import for_, yield_
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import run_on_npu
 from ml_dtypes import bfloat16
 
 # Constants for buffer sizes and loop bounds
@@ -69,6 +69,13 @@ def parse_args():
         dest="output_format",
         help="Output format for the compiled binary (default: xclbin)",
     )
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        choices=["compile-only", "compile-and-run"],
+        dest="compile_mode",
+        default="compile-and-run",
+    )
     args = parser.parse_args()
     return args
 
@@ -456,20 +463,15 @@ def main():
     B = np.random.rand(M_SIZE, N_SIZE).astype(bfloat16)
     C = (A + B + 3.0).astype(bfloat16)
 
-    # Run the module using XRTRunner
-    runner = XRTRunner(
-        omit_while_true_loop=False,
-        verbose=False,
-        runtime_loop_tiling_sizes=[2, 2],
-        output_format=args.output_format,
-        instance_name="func1",
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[A, B],
+            instance_name="func1",
             expected_outputs=[C],
             rtol=1e-2,
+            runtime_loop_tiling_sizes=[2, 2],
         )
     )
 
diff --git a/programming_examples/layer_norm/layer_norm.py b/programming_examples/layer_norm/layer_norm.py
index 93ddeb010..a60d9e773 100644
--- a/programming_examples/layer_norm/layer_norm.py
+++ b/programming_examples/layer_norm/layer_norm.py
@@ -14,7 +14,7 @@
 configurable VECTOR_SIZE (default 16 for AIE2).
 """
 
-import argparse
+import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
@@ -29,8 +29,7 @@
 )
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -44,16 +43,15 @@ def build_module(M, N, np_dtype, vector_size=16):
         N % vector_size == 0
     ), f"N ({N}) must be divisible by vector_size ({vector_size})"
 
-    vecTy = VectorType.get([vector_size], xrt_dtype)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    vecTy = vec_type(vector_size, xrt_dtype)
+    identity_map = identity_map_attr()
 
     # L3 types
     l3MemrefTy = MemRefType.get([M, N], xrt_dtype)
 
     # L1 types
-    l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    l1RowTy = MemRefType.get([N], xrt_dtype, memory_space=l1_mem_space)
-    l1VecTy = MemRefType.get([vector_size], xrt_dtype, memory_space=l1_mem_space)
+    l1RowTy = l1_memref_type([N], xrt_dtype)
+    l1VecTy = l1_memref_type([vector_size], xrt_dtype)
 
     @FuncOp.from_py_func(l3MemrefTy, l3MemrefTy)
     def layer_norm(arg0, arg1):
@@ -177,12 +175,7 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out):
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the layer normalization example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the layer normalization example")
     parser.add_argument("--M", type=int, default=M_DEFAULT, help="M dimension (rows)")
     parser.add_argument("--N", type=int, default=N_DEFAULT, help="N dimension (cols)")
     parser.add_argument(
@@ -191,20 +184,6 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out):
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.M, args.N, INPUT_DATATYPE, args.vector_size)
@@ -222,30 +201,14 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out):
     rstd = 1.0 / np.sqrt(variance + eps)
     y_expected = ((x_input - mean) * rstd).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[x_input],
             instance_name="layer_norm",
-            runtime_loop_tiling_sizes=[4, 4],
+            expected_outputs=[y_expected],
+            rtol=5e-2,
+            atol=5e-1,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[x_input],
-                expected_outputs=[y_expected],
-                rtol=5e-2,
-                atol=5e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/leaky_relu/leaky_relu.py b/programming_examples/leaky_relu/leaky_relu.py
index 46fd4dadf..ca10f52bc 100644
--- a/programming_examples/leaky_relu/leaky_relu.py
+++ b/programming_examples/leaky_relu/leaky_relu.py
@@ -11,23 +11,26 @@
 configurable VECTOR_SIZE (default 16).
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import numpy as np
 
 np.random.seed(42)
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
+from utils import vec_read, vec_write
 
 range_ = for_
 
@@ -41,18 +44,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=0.01, vector_size=16):
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def leaky_relu(arg0, arg1):
@@ -73,21 +68,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_in_data,
@@ -107,24 +88,12 @@ def herd_body(
                 v_alpha = BroadcastOp(vecTy, alpha_const)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_in = subview(
-                        l1_in_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_out = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True])
+                    v_x = vec_read(l1_in_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_alpha_x = arith.MulFOp(v_x, v_alpha)
                     # Leaky RELU: x >= 0 ? x : alpha*x
                     cmp = arith.CmpFOp(arith.CmpFPredicate.OGE, v_x, v_zero)
                     v_result = arith.SelectOp(cmp, v_x, v_alpha_x)
-                    transfer_write(None, v_result, sub_out, [c0], identity_map, [True])
+                    vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -146,12 +115,7 @@ def herd_body(
     INPUT_DATATYPE = bfloat16
     ALPHA = 0.01
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the Leaky RELU example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the Leaky RELU example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -163,20 +127,6 @@ def herd_body(
         default=16,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
@@ -190,44 +140,27 @@ def herd_body(
     # Mix of positive and negative values for Leaky RELU testing
     input_a = np.random.randn(args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-        sampled_values = np.array(
-            [
-                np.where(input_a[i] >= 0, input_a[i], args.alpha * input_a[i])
-                for i in zip(*sampled_indices)
-            ],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
+    sampled_values = np.array(
+        [
+            np.where(input_a[i] >= 0, input_a[i], args.alpha * input_a[i])
+            for i in zip(*sampled_indices)
+        ],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="leaky_relu",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index 7a7f86ec9..e3bf7e3ca 100644
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -85,8 +85,8 @@
         )
         result = result.stdout.decode("utf-8").split("\n")
         # Older format is "|[0000:41:00.1]  ||RyzenAI-npu1  |"
-        # Newer format is "|[0000:41:00.1]  |NPU Phoenix  |"
-        p = re.compile(r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU (\w+))\W*\|")
+        # Newer format is "|[0000:41:00.1]  |NPU Phoenix  |" or "|[0000:c6:00.1]  |NPU Strix Halo  |"
+        p = re.compile(r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU ([\w ]+?))\s*\|")
         for l in result:
             m = p.match(l)
             if not m:
@@ -94,9 +94,9 @@
             print("Found Ryzen AI device:", m.group(1))
             model = "unknown"
             if m.group(3):
-                model = str(m.group(3))
+                model = str(m.group(3)).strip()
             if m.group(4):
-                model = str(m.group(4))
+                model = str(m.group(4)).strip()
             print(f"\tmodel: '{model}'")
             config.available_features.add("ryzen_ai")
             run_on_npu = (
@@ -106,10 +106,10 @@
                 run_on_npu1 = run_on_npu
                 config.available_features.add("ryzen_ai_npu1")
                 print("Running tests on NPU1 with command line: ", run_on_npu)
-            elif model in ["npu4", "Strix"]:
+            elif "Strix" in model or model in ["npu4"]:
                 run_on_npu2 = run_on_npu
                 config.available_features.add("ryzen_ai_npu2")
-                print("Running tests on NPU4 with command line: ", run_on_2npu)
+                print("Running tests on NPU2 with command line: ", run_on_npu)
             else:
                 print(f"WARNING: xrt-smi reported unknown NPU model '{model}'.")
             break
diff --git a/programming_examples/llama2_mha/mha.py b/programming_examples/llama2_mha/mha.py
index 3f2fa86c1..7dd406512 100644
--- a/programming_examples/llama2_mha/mha.py
+++ b/programming_examples/llama2_mha/mha.py
@@ -10,8 +10,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from ml_dtypes import bfloat16
 
 range_ = for_
@@ -867,37 +866,16 @@ def herd_body_0(_tx, _ty, _sx, _sy, c_data, out_data, pos):
         for i in range(0, args.n):
             output_xb[i] += softmax_output[t] * output_vc[t][i]
 
-    if args.compile_mode == "compile-and-run":
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            omit_pingpong=True,
-            output_format=args.output_format,
-            instance_name="mha_bf16",
+    instance_name = args.instance_name if args.instance_name else "mha_bf16"
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b, output_kc, output_vc],
+            instance_name=instance_name,
+            expected_outputs=[output_xb],
+            rtol=1e0,
             runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b, output_kc, output_vc],
-                expected_outputs=[output_xb],
-                rtol=1e0,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ####### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
             omit_pingpong=True,
-            kernel_name=args.kernel_name,
-            instance_name=args.instance_name,
-            kernel_id=args.kernel_id,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/matrix_multiplication/bf16/run.py b/programming_examples/matrix_multiplication/bf16/run.py
index 1d1475d8e..4b2926a11 100644
--- a/programming_examples/matrix_multiplication/bf16/run.py
+++ b/programming_examples/matrix_multiplication/bf16/run.py
@@ -13,8 +13,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from air.extras import types as extrasT
 from air.dialects.linalg.opdsl.lang import *
 import air.dialects.linalg.opdsl.lang as linalg_lang
@@ -716,8 +715,13 @@ def herd_body(
     input_a = (np.random.randn(args.m, args.k) * 4).astype(INPUT_DATATYPE)
     input_b = (np.random.rand(args.k, args.n) * 4).astype(INPUT_DATATYPE)
 
+    # Build common compile kwargs
+    compile_kwargs = {"runtime_loop_tiling_sizes": [2, 2]}
+    if not args.direct_codegen:
+        compile_kwargs["lower_linalg_to_func"] = "mm.o"
+
     if args.compile_mode == "compile-and-run":
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
+        # Stochastically sample num_sample results, and pass to run_on_npu for verification.
         num_samples = 100
         sampled_indices = np.vstack(
             [
@@ -747,64 +751,35 @@ def herd_body(
             "values": sampled_values,
         }
 
-        ###### Compile and test
-        runner_kwargs = {
-            "verbose": args.verbose,
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            runner_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        runner = XRTRunner(**runner_kwargs, instance_name="matmul_bf16")
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_a, input_b],
+                instance_name="matmul_bf16",
                 stochastic_expected_outputs=[sampled_data],
                 rtol=0.05,
                 atol=4,
                 max_mismatch_percentage=5,
+                **compile_kwargs,
             )
         )
 
-    elif args.compile_mode == "compile-and-xclbin":
-        ###### Compile and generate xclbin (requires XRT, no execution)
-        backend_kwargs = {
-            "verbose": args.verbose,
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            backend_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        backend = XRTBackend(**backend_kwargs)
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only (without XRT dependencies)
-        # Map architecture to target device
-        target_device = "npu2" if args.arch == "aie2p" else "npu1"
-
-        backend_kwargs = {
-            "verbose": args.verbose,
-            "target_device": target_device,  # Explicit target based on arch (no xrt dependencies)
-            "output_format": "none",  # Skip xclbin generation (no xrt dependencies)
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            backend_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        backend = XRTBackend(**backend_kwargs)
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
-
-        print("Compilation completed successfully!")
-        sys.exit(0)
+    elif args.compile_mode in ("compile-and-xclbin", "compile-only"):
+        ###### Compile only (no execution)
+        if args.compile_mode == "compile-only":
+            # Explicit target from arch, no xclbin generation (no xrt dependencies)
+            target_device = "npu2" if args.arch == "aie2p" else "npu1"
+            compile_kwargs["target_device"] = target_device
+            compile_kwargs["output_format"] = "none"
+        # Remap to compile-only so run_on_npu dispatches correctly
+        args.compile_mode = "compile-only"
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="matmul_bf16",
+                **compile_kwargs,
+            )
+        )
diff --git a/programming_examples/matrix_multiplication/i16/run.py b/programming_examples/matrix_multiplication/i16/run.py
index 83567e8df..2f83a2810 100644
--- a/programming_examples/matrix_multiplication/i16/run.py
+++ b/programming_examples/matrix_multiplication/i16/run.py
@@ -12,8 +12,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from air.extras import types as extrasT
 from air.dialects.linalg.opdsl.lang import *
 import air.dialects.linalg.opdsl.lang as linalg_lang
@@ -710,9 +709,14 @@ def herd_body(
     input_b = np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7
     input_b = input_b.astype(INPUT_DATATYPE)
 
+    # Build common compile kwargs
+    compile_kwargs = {"runtime_loop_tiling_sizes": [2, 2]}
+    if not args.direct_codegen:
+        compile_kwargs["lower_linalg_to_func"] = "mm.o"
+
     if args.compile_mode == "compile-and-run":
 
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
+        # Stochastically sample num_sample results, and pass to run_on_npu for verification.
         num_samples = 100
         sampled_indices = np.vstack(
             [
@@ -743,61 +747,32 @@ def herd_body(
             "values": sampled_values,
         }
 
-        ###### Compile and test
-        runner_kwargs = {
-            "verbose": args.verbose,
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            runner_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        runner = XRTRunner(**runner_kwargs, instance_name="matmul_bf16")
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_a, input_b],
+                instance_name="matmul_bf16",
                 stochastic_expected_outputs=[sampled_data],
+                **compile_kwargs,
             )
         )
 
-    elif args.compile_mode == "compile-and-xclbin":
-        ###### Compile and generate xclbin (requires XRT, no execution)
-        backend_kwargs = {
-            "verbose": args.verbose,
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            backend_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        backend = XRTBackend(**backend_kwargs)
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only (without XRT dependencies)
-        # Map architecture to target device
-        target_device = "npu2" if args.arch == "aie2p" else "npu1"
-
-        backend_kwargs = {
-            "verbose": args.verbose,
-            "target_device": target_device,  # Explicit target based on arch (no xrt dependencies)
-            "output_format": "none",  # Skip xclbin generation (no xrt dependencies)
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            backend_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        backend = XRTBackend(**backend_kwargs)
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
-
-        print("Compilation completed successfully!")
-        sys.exit(0)
+    elif args.compile_mode in ("compile-and-xclbin", "compile-only"):
+        ###### Compile only (no execution)
+        if args.compile_mode == "compile-only":
+            # Explicit target from arch, no xclbin generation (no xrt dependencies)
+            target_device = "npu2" if args.arch == "aie2p" else "npu1"
+            compile_kwargs["target_device"] = target_device
+            compile_kwargs["output_format"] = "none"
+        # Remap to compile-only so run_on_npu dispatches correctly
+        args.compile_mode = "compile-only"
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="matmul_bf16",
+                **compile_kwargs,
+            )
+        )
diff --git a/programming_examples/matrix_multiplication/i8/run.py b/programming_examples/matrix_multiplication/i8/run.py
index fd65144e1..fed6615de 100644
--- a/programming_examples/matrix_multiplication/i8/run.py
+++ b/programming_examples/matrix_multiplication/i8/run.py
@@ -12,8 +12,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from air.extras import types as extrasT
 from air.dialects.linalg.opdsl.lang import *
 import air.dialects.linalg.opdsl.lang as linalg_lang
@@ -710,9 +709,14 @@ def herd_body(
     input_b = np.arange(0, args.k * args.n, dtype=np.int64).reshape(args.k, args.n) % 7
     input_b = input_b.astype(INPUT_DATATYPE)
 
+    # Build common compile kwargs
+    compile_kwargs = {"runtime_loop_tiling_sizes": [2, 2]}
+    if not args.direct_codegen:
+        compile_kwargs["lower_linalg_to_func"] = "mm.o"
+
     if args.compile_mode == "compile-and-run":
 
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
+        # Stochastically sample num_sample results, and pass to run_on_npu for verification.
         num_samples = 100
         sampled_indices = np.vstack(
             [
@@ -743,61 +747,32 @@ def herd_body(
             "values": sampled_values,
         }
 
-        ###### Compile and test
-        runner_kwargs = {
-            "verbose": args.verbose,
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            runner_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        runner = XRTRunner(**runner_kwargs, instance_name="matmul_bf16")
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_a, input_b],
+                instance_name="matmul_bf16",
                 stochastic_expected_outputs=[sampled_data],
+                **compile_kwargs,
             )
         )
 
-    elif args.compile_mode == "compile-and-xclbin":
-        ###### Compile and generate xclbin (requires XRT, no execution)
-        backend_kwargs = {
-            "verbose": args.verbose,
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            backend_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        backend = XRTBackend(**backend_kwargs)
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only (without XRT dependencies)
-        # Map architecture to target device
-        target_device = "npu2" if args.arch == "aie2p" else "npu1"
-
-        backend_kwargs = {
-            "verbose": args.verbose,
-            "target_device": target_device,  # Explicit target based on arch (no xrt dependencies)
-            "output_format": "none",  # Skip xclbin generation (no xrt dependencies)
-            "omit_while_true_loop": False,
-            "runtime_loop_tiling_sizes": [2, 2],
-        }
-        # Only use external kernel library if NOT in direct codegen mode
-        if not args.direct_codegen:
-            backend_kwargs["lower_linalg_to_func"] = "mm.o"
-
-        backend = XRTBackend(**backend_kwargs)
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
-
-        print("Compilation completed successfully!")
-        sys.exit(0)
+    elif args.compile_mode in ("compile-and-xclbin", "compile-only"):
+        ###### Compile only (no execution)
+        if args.compile_mode == "compile-only":
+            # Explicit target from arch, no xclbin generation (no xrt dependencies)
+            target_device = "npu2" if args.arch == "aie2p" else "npu1"
+            compile_kwargs["target_device"] = target_device
+            compile_kwargs["output_format"] = "none"
+        # Remap to compile-only so run_on_npu dispatches correctly
+        args.compile_mode = "compile-only"
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="matmul_bf16",
+                **compile_kwargs,
+            )
+        )
diff --git a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
index 950f33a92..2ff7e1dfc 100644
--- a/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_core_channel/multi_core_channel.py
@@ -1,13 +1,11 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -80,15 +78,8 @@ def segment_body():
 
                         @herd(name=format_name("xaddherd", h, w), sizes=[1, 1])
                         def herd_body(_tx, _ty, _sx, _sy):
-                            # We want to store our data in L1 memory
-                            mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                             # This is the type definition of the tile
-                            tile_type = MemRefType.get(
-                                shape=tile_size,
-                                element_type=xrt_dtype,
-                                memory_space=mem_space,
-                            )
+                            tile_type = l1_memref_type(tile_size, xrt_dtype)
 
                             # We must allocate a buffer of tile size for the input/output
                             tile_in = AllocOp(tile_type, [], [])
@@ -133,20 +124,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     TILE_HEIGHT = 16
     INOUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--image-height",
         type=int,
@@ -162,14 +140,6 @@ def herd_body(_tx, _ty, _sx, _sy):
     parser.add_argument(
         "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -199,10 +169,14 @@ def herd_body(_tx, _ty, _sx, _sy):
             )
             output_b[i, j] = input_a[i, j] + tile_num
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    # Exit with run_on_npu's result so a failing run is reported to the caller.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b],
+            instance_name="copy",
+            runtime_loop_tiling_sizes=[4, 4],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
index b33a3c7a5..1930fd7d9 100644
--- a/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
+++ b/programming_examples/matrix_scalar_add/multi_core_dma/multi_core_dma.py
@@ -1,14 +1,11 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.dialects.affine import apply as affine_apply
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -43,62 +40,16 @@ def segment_body(arg2, arg3):
                     operands=[arg2, arg3],
                 )
                 def herd_body(tx, ty, _sx, _sy, a, b):
-                    scaled_index_map_height = AffineMap.get(
-                        0,
-                        1,
-                        [
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(0),
-                                AffineConstantExpr.get(tile_height),
-                            )
-                        ],
-                    )
-                    scaled_index_map_width = AffineMap.get(
-                        0,
-                        1,
-                        [
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(0),
-                                AffineConstantExpr.get(tile_width),
-                            )
-                        ],
-                    )
-                    create_tile_index_height = AffineMap.get(
-                        0,
-                        1,
-                        [
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(0),
-                                AffineConstantExpr.get(image_width // tile_width),
-                            )
-                        ],
-                    )
-                    create_tile_index = AffineMap.get(
-                        0,
-                        2,
-                        [
-                            AffineExpr.get_add(
-                                AffineSymbolExpr.get(0),
-                                AffineSymbolExpr.get(1),
-                            )
-                        ],
-                    )
-                    offset0 = affine_apply(scaled_index_map_height, [tx])
-                    offset1 = affine_apply(scaled_index_map_width, [ty])
-                    tile_index_height = affine_apply(create_tile_index_height, [tx])
-                    compute_tile_id = affine_apply(
-                        create_tile_index, [tile_index_height, ty]
+                    offset0 = tile_offset_1d(tx, 0, tile_height)
+                    offset1 = tile_offset_1d(ty, 0, tile_width)
+                    tile_index_height = arith.muli(
+                        tx,
+                        arith.ConstantOp.create_index(image_width // tile_width),
                     )
-
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+                    compute_tile_id = arith.addi(tile_index_height, ty)
 
                     # This is the type definition of the tile
-                    tile_type = MemRefType.get(
-                        shape=tile_size,
-                        element_type=T.i32(),
-                        memory_space=mem_space,
-                    )
+                    tile_type = l1_memref_type(tile_size, T.i32())
 
                     # We must allocate a buffer of tile size for the input/output
                     tile_in = AllocOp(tile_type, [], [])
@@ -151,20 +102,7 @@ def herd_body(tx, ty, _sx, _sy, a, b):
     TILE_HEIGHT = 16
     INOUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--image-height",
         type=int,
@@ -180,14 +118,6 @@ def herd_body(tx, ty, _sx, _sy, a, b):
     parser.add_argument(
         "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -217,10 +147,14 @@ def herd_body(tx, ty, _sx, _sy, a, b):
             )
             output_b[i, j] = input_a[i, j] + tile_num
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    # Exit with run_on_npu's result so a failing run is reported to the caller.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b],
+            instance_name="copy",
+            runtime_loop_tiling_sizes=[4, 4],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
index c70a2048d..712796dea 100644
--- a/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
+++ b/programming_examples/matrix_scalar_add/multi_launch_channel/multi_launch_channel.py
@@ -1,13 +1,11 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -71,15 +69,8 @@ def segment_body():
 
                         @herd(name=format_name("xaddherd", h, w), sizes=[1, 1])
                         def herd_body(_tx, _ty, _sx, _sy):
-                            # We want to store our data in L1 memory
-                            mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                             # This is the type definition of the tile
-                            tile_type = MemRefType.get(
-                                shape=tile_size,
-                                element_type=xrt_dtype,
-                                memory_space=mem_space,
-                            )
+                            tile_type = l1_memref_type(tile_size, xrt_dtype)
 
                             # We must allocate a buffer of tile size for the input/output
                             tile_in = AllocOp(tile_type, [], [])
@@ -124,20 +115,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     TILE_HEIGHT = 16
     INOUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--image-height",
         type=int,
@@ -153,14 +131,6 @@ def herd_body(_tx, _ty, _sx, _sy):
     parser.add_argument(
         "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -190,10 +160,14 @@ def herd_body(_tx, _ty, _sx, _sy):
             )
             output_b[i, j] = input_a[i, j] + tile_num
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    # Exit with run_on_npu's result so a failing run is reported to the caller.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b],
+            instance_name="copy",
+            runtime_loop_tiling_sizes=[4, 4],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
index 6c4c1c553..daad63ab8 100644
--- a/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
+++ b/programming_examples/matrix_scalar_add/single_core_channel/single_core_channel.py
@@ -1,13 +1,11 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -71,15 +69,8 @@ def segment_body():
                 @herd(name="xaddherd", sizes=[1, 1])
                 def herd_body(_tx, _ty, _sx, _sy):
 
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                     # This is the type definition of the tile
-                    tile_type = MemRefType.get(
-                        shape=tile_size,
-                        element_type=xrt_dtype,
-                        memory_space=mem_space,
-                    )
+                    tile_type = l1_memref_type(tile_size, xrt_dtype)
 
                     # Loop over columns and rows of tiles
                     for tile_num in range_(
@@ -127,20 +118,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     TILE_HEIGHT = 16
     INOUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--image-height",
         type=int,
@@ -156,14 +134,6 @@ def herd_body(_tx, _ty, _sx, _sy):
     parser.add_argument(
         "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -193,10 +163,14 @@ def herd_body(_tx, _ty, _sx, _sy):
             )
             output_b[i, j] = input_a[i, j] + tile_num
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    # Exit with run_on_npu's result so a failing run is reported to the caller.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b],
+            instance_name="copy",
+            runtime_loop_tiling_sizes=[4, 4],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
index 7c72ba371..ca29409d4 100644
--- a/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
+++ b/programming_examples/matrix_scalar_add/single_core_dma/single_core_dma.py
@@ -1,13 +1,11 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-
 from air.ir import *
 from air.dialects.air import *
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -38,15 +36,8 @@ def segment_body(arg2, arg3):
                 # We just need one compute core, so we ask for a 1x1 herd
                 @herd(name="xaddherd", sizes=[1, 1], operands=[arg2, arg3])
                 def herd_body(_tx, _ty, _sx, _sy, a, b):
-                    # We want to store our data in L1 memory
-                    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                     # This is the type definition of the tile
-                    tile_type = MemRefType.get(
-                        shape=tile_size,
-                        element_type=T.i32(),
-                        memory_space=mem_space,
-                    )
+                    tile_type = l1_memref_type(tile_size, T.i32())
 
                     # Loop over columns and rows of tiles
                     for tile_index0 in range_(image_height // tile_height):
@@ -121,20 +112,7 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
     TILE_HEIGHT = 16
     INOUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--image-height",
         type=int,
@@ -150,14 +128,6 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
     parser.add_argument(
         "--tile-width", type=int, default=TILE_WIDTH, help="Width of the tile data"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -187,10 +157,11 @@ def herd_body(_tx, _ty, _sx, _sy, a, b):
             )
             output_b[i, j] = input_a[i, j] + tile_num
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
+    run_on_npu(
+        args,
+        mlir_module,
+        inputs=[input_a],
+        expected_outputs=[output_b],
         instance_name="copy",
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/matrix_vector_multiplication/bf16/matvec.py b/programming_examples/matrix_vector_multiplication/bf16/matvec.py
index 30bfd232a..e4c6f05e1 100644
--- a/programming_examples/matrix_vector_multiplication/bf16/matvec.py
+++ b/programming_examples/matrix_vector_multiplication/bf16/matvec.py
@@ -25,8 +25,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -376,28 +375,28 @@ def herd_body(_tx, _ty, _sx, _sy, _l1_c, _l2_c):
             input_a.astype(np.float32), input_b.astype(np.float32)
         ).astype(OUTPUT_DATATYPE)
 
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-            output_format=args.output_format,
-            instance_name="matvec_bf16",
-        )
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_a, input_b],
+                instance_name="matvec_bf16",
                 expected_outputs=[output_c],
                 rtol=0.04,
                 atol=1e-3,
+                runtime_loop_tiling_sizes=[4, 4],
             )
         )
 
     elif args.compile_mode == "compile-and-xclbin":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
+        # Remap to compile-only so run_on_npu dispatches correctly
+        args.compile_mode = "compile-only"
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="matvec_bf16",
+                runtime_loop_tiling_sizes=[4, 4],
+            )
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
diff --git a/programming_examples/mnist_fc/argmax/run.py b/programming_examples/mnist_fc/argmax/run.py
index 4b9b396ad..7d33ce2dd 100644
--- a/programming_examples/mnist_fc/argmax/run.py
+++ b/programming_examples/mnist_fc/argmax/run.py
@@ -25,8 +25,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, subview, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, run_on_npu
 from air.extras import types as extrasT
 
 np.random.seed(42)
@@ -197,6 +196,13 @@ def herd_body(tx, ty, _sx, _sy, _loff_row, _l3_a, _l3_out):
         dest="compile_mode",
         default="compile-and-run",
     )
+    parser.add_argument(
+        "--output-format",
+        type=str,
+        choices=["xclbin", "elf"],
+        default="xclbin",
+        dest="output_format",
+    )
 
     args = parser.parse_args()
 
@@ -258,29 +264,26 @@ def herd_body(tx, ty, _sx, _sy, _loff_row, _l3_a, _l3_out):
             "values": sampled_values,
         }
 
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="xclbin",
-            instance_name="argmax",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_a],
+                instance_name="argmax",
                 stochastic_expected_outputs=[sampled_data],
                 rtol=0,
                 atol=0,
+                runtime_loop_tiling_sizes=[4, 4],
             )
         )
 
     elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="xclbin",
-            runtime_loop_tiling_sizes=[4, 4],
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="argmax",
+                runtime_loop_tiling_sizes=[4, 4],
+            )
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
diff --git a/programming_examples/mnist_fc/broadcast_bias_add/run.py b/programming_examples/mnist_fc/broadcast_bias_add/run.py
index dfdba1b2c..56d59c149 100644
--- a/programming_examples/mnist_fc/broadcast_bias_add/run.py
+++ b/programming_examples/mnist_fc/broadcast_bias_add/run.py
@@ -28,8 +28,7 @@
 from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, run_on_npu
 from air.extras import types as extrasT
 
 np.random.seed(42)
@@ -311,6 +310,9 @@ def herd_body(
     input_bias = np.zeros(N_padded, dtype=np.float32)
     input_bias[:N_actual] = (np.random.randn(N_actual) * 2).astype(np.float32)
 
+    # Set output_format based on padding requirements
+    args.output_format = "elf" if needs_padding else "xclbin"
+
     if args.compile_mode == "compile-and-run":
         # Golden: C[row,col] = A[row,col] + bias[col]
         num_samples = 100
@@ -355,28 +357,25 @@ def herd_body(
             "values": sampled_values,
         }
 
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="elf" if needs_padding else "xclbin",
-            instance_name="broadcast_bias_add",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_a, input_bias],
+                instance_name="broadcast_bias_add",
                 stochastic_expected_outputs=[sampled_data],
                 rtol=1e-6,
+                runtime_loop_tiling_sizes=[4, 4],
             )
         )
 
     elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="elf" if needs_padding else "xclbin",
-            runtime_loop_tiling_sizes=[4, 4],
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="broadcast_bias_add",
+                runtime_loop_tiling_sizes=[4, 4],
+            )
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
diff --git a/programming_examples/mnist_fc/integration/run.py b/programming_examples/mnist_fc/integration/run.py
index b5673d8aa..92acc2421 100644
--- a/programming_examples/mnist_fc/integration/run.py
+++ b/programming_examples/mnist_fc/integration/run.py
@@ -35,8 +35,7 @@
 from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, run_on_npu
 from air.compiler.util import run_transform
 from air.extras import types as extrasT
 
@@ -696,6 +695,13 @@ def _extend_with_elementwise(
         dest="compile_mode",
         default="compile-and-run",
     )
+    parser.add_argument(
+        "--output-format",
+        type=str,
+        choices=["xclbin", "elf"],
+        default="elf",
+        dest="output_format",
+    )
     args = parser.parse_args()
 
     # aie2p mmul dimensions
@@ -811,15 +817,9 @@ def _extend_with_elementwise(
             "values": golden_argmax_pad[sampled_cols],
         }
 
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="elf",
-            instance_name="mnist_fc",
-            runtime_loop_tiling_sizes=[1, 1],
-        )
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[
                     W1,
@@ -831,18 +831,21 @@ def _extend_with_elementwise(
                     bias2,
                     bias2_out,
                 ],
+                instance_name="mnist_fc",
                 stochastic_expected_outputs=[sampled_data],
                 rtol=0,
                 atol=0,
+                runtime_loop_tiling_sizes=[1, 1],
             )
         )
 
     elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="elf",
-            runtime_loop_tiling_sizes=[1, 1],
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="mnist_fc",
+                runtime_loop_tiling_sizes=[1, 1],
+            )
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
diff --git a/programming_examples/mnist_fc/relu/run.py b/programming_examples/mnist_fc/relu/run.py
index 08ea4d01d..2cdcf2eb7 100644
--- a/programming_examples/mnist_fc/relu/run.py
+++ b/programming_examples/mnist_fc/relu/run.py
@@ -24,8 +24,7 @@
 from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, run_on_npu
 from air.extras import types as extrasT
 
 np.random.seed(42)
@@ -335,6 +334,9 @@ def herd_body(tx, ty, _sx, _sy, _loff_m, _loff_n, _l3_a, _l3_out):
         np.float32
     )
 
+    # Set output_format based on padding requirements
+    args.output_format = "elf" if needs_padding else "xclbin"
+
     if args.compile_mode == "compile-and-run":
         # Golden reference: max(x, 0)
         num_samples = 100
@@ -381,29 +383,26 @@ def herd_body(tx, ty, _sx, _sy, _loff_m, _loff_n, _l3_a, _l3_out):
             "values": sampled_values,
         }
 
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="elf" if needs_padding else "xclbin",
-            instance_name="relu",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
         # bf16 truncation introduces rounding; use bf16-appropriate tolerance
         exit(
-            runner.run_test(
+            run_on_npu(
+                args,
                 mlir_module,
                 inputs=[input_a],
+                instance_name="relu",
                 stochastic_expected_outputs=[sampled_data],
                 rtol=1e-2,
+                runtime_loop_tiling_sizes=[4, 4],
             )
         )
 
     elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="elf" if needs_padding else "xclbin",
-            runtime_loop_tiling_sizes=[4, 4],
+        exit(
+            run_on_npu(
+                args,
+                mlir_module,
+                inputs=[],
+                instance_name="relu",
+                runtime_loop_tiling_sizes=[4, 4],
+            )
         )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
diff --git a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
index af2eea2fe..8fae089ae 100644
--- a/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_channel/multi_segment.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -21,15 +20,8 @@ def build_module():
     xrt_dtype = type_mapper(INOUT_DATATYPE)
     memrefTyInOut = T.memref(VECTOR_LEN, xrt_dtype)
 
-    # We want to store our data in L1 memory
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
     # This is the type definition of the tile
-    image_type_l1 = MemRefType.get(
-        shape=[VECTOR_LEN],
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    image_type_l1 = l1_memref_type([VECTOR_LEN], xrt_dtype)
 
     Channel("ChanInA")
     Channel("ChanInB")
@@ -94,27 +86,8 @@ def herd_body(tx, ty, sx, sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the multi segment channel example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
+    parser = make_air_parser(
+        "Builds, runs, and tests the multi segment channel example"
     )
 
     args = parser.parse_args()
@@ -129,16 +102,12 @@ def herd_body(tx, ty, sx, sy):
     output_c = np.full(VECTOR_LEN, 5, dtype=INOUT_DATATYPE)
     output_d = np.full(VECTOR_LEN, 13, dtype=INOUT_DATATYPE)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a, input_b],
+            instance_name="copy",
             expected_outputs=[output_c, output_d],
         )
     )
diff --git a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
index 058a77576..f8f33cbe3 100644
--- a/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
+++ b/programming_examples/multi_segment/multi_segment_dma/multi_segment.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -21,15 +20,8 @@ def build_module():
     xrt_dtype = type_mapper(INOUT_DATATYPE)
     memrefTyInOut = T.memref(VECTOR_LEN, xrt_dtype)
 
-    # We want to store our data in L1 memory
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
     # This is the type definition of the tile
-    image_type_l1 = MemRefType.get(
-        shape=[VECTOR_LEN],
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    image_type_l1 = l1_memref_type([VECTOR_LEN], xrt_dtype)
 
     # We will send an image worth of data in and out
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut, memrefTyInOut, memrefTyInOut)
@@ -86,28 +78,7 @@ def herd_body(tx, ty, sx, sy, b, d):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the multi segment dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
+    parser = make_air_parser("Builds, runs, and tests the multi segment dma example")
 
     args = parser.parse_args()
 
@@ -121,16 +92,12 @@ def herd_body(tx, ty, sx, sy, b, d):
     output_c = np.full(VECTOR_LEN, 5, dtype=INOUT_DATATYPE)
     output_d = np.full(VECTOR_LEN, 13, dtype=INOUT_DATATYPE)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a, input_b],
+            instance_name="copy",
             expected_outputs=[output_c, output_d],
         )
     )
diff --git a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
index b4c4c52c1..7c43b645c 100644
--- a/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
+++ b/programming_examples/passthrough/passthrough_channel/passthrough_channel.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -29,12 +28,7 @@ def build_module(vector_size, num_subvectors):
     lineWidthInBytes = vector_size // num_subvectors
 
     # Memref type definition used by the compute core and external function
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    tensor_type = MemRefType.get(
-        shape=[lineWidthInBytes],
-        element_type=xrt_dtype,
-        memory_space=mem_space,
-    )
+    tensor_type = l1_memref_type([lineWidthInBytes], xrt_dtype)
 
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
     def copy(arg0, arg1):
@@ -75,10 +69,7 @@ def herd_body(_tx, _ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "-s",
         "--vector_size",
@@ -92,24 +83,6 @@ def herd_body(_tx, _ty, _sx, _sy):
         default=4,
         help="The number of sub-vectors to break the vector into",
     )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -124,10 +97,12 @@ def herd_body(_tx, _ty, _sx, _sy):
         input_a[i] = i % 0xFF
         output_b[i] = i % 0xFF
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="copy",
+            expected_outputs=[output_b],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
index 12baf5dad..58ac631a6 100644
--- a/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
+++ b/programming_examples/passthrough/passthrough_dma/passthrough_dma.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 from ml_dtypes import bfloat16
 
@@ -9,7 +8,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -36,12 +35,7 @@ def build_module(vector_size, num_subvectors, np_dtype):
     lineWidthInBytes = vector_size // num_subvectors
 
     # Memref type definition used by the compute core and external function
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    tensor_type = MemRefType.get(
-        shape=[lineWidthInBytes],
-        element_type=xrt_dtype,
-        memory_space=mem_space,
-    )
+    tensor_type = l1_memref_type([lineWidthInBytes], xrt_dtype)
 
     @FuncOp.from_py_func(memrefTyInOut, memrefTyInOut)
     def copy(arg0, arg1):
@@ -95,10 +89,7 @@ def herd_body(_tx, _ty, _sx, _sy, c, d):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "-s",
         "--vector_size",
@@ -112,24 +103,6 @@ def herd_body(_tx, _ty, _sx, _sy, c, d):
         default=4,
         help="The number of sub-vectors to break the vector into",
     )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
     parser.add_argument(
         "-t",
         "--dtype",
@@ -148,10 +121,12 @@ def herd_body(_tx, _ty, _sx, _sy, c, d):
     input_a = np.arange(args.vector_size, dtype=np_dtype)
     output_b = np.arange(args.vector_size, dtype=np_dtype)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="copy",
+            expected_outputs=[output_b],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
index 7d4917b23..ad58ec774 100644
--- a/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
+++ b/programming_examples/passthrough/passthrough_kernel/passthrough_kernel.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -29,12 +28,7 @@ def build_module(vector_size, num_subvectors):
     lineWidthInBytes = vector_size // num_subvectors
 
     # Memref type definition used by the compute core and external function
-    mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    tensor_type = MemRefType.get(
-        shape=[lineWidthInBytes],
-        element_type=xrt_dtype,
-        memory_space=mem_space,
-    )
+    tensor_type = l1_memref_type([lineWidthInBytes], xrt_dtype)
 
     # Function definition of the external function we will call
     passThroughLine = external_func(
@@ -77,10 +71,7 @@ def herd_body(_tx, _ty, _sx, _sy):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "-s",
         "--vector_size",
@@ -94,24 +85,6 @@ def herd_body(_tx, _ty, _sx, _sy):
         default=4,
         help="The number of sub-vectors to break the vector into",
     )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -123,10 +96,12 @@ def herd_body(_tx, _ty, _sx, _sy):
     input_a = np.arange(args.vector_size, dtype=INOUT_DATATYPE)
     output_b = np.arange(args.vector_size, dtype=INOUT_DATATYPE)
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="copy",
+            expected_outputs=[output_b],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py b/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py
index 8198645bd..455fd9da0 100644
--- a/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py
+++ b/programming_examples/primitives/scalar_examples/scalar_invsqrt/scalar_invsqrt.py
@@ -1,18 +1,15 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.math import rsqrt
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -29,11 +26,7 @@ def build_module(n, tile_n, np_dtype_in):
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def scalar_invsqrt(arg0, arg1):
@@ -55,21 +48,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
@@ -117,19 +96,8 @@ def herd_body(
     TILE_N = 1024
     INPUT_DATATYPE = np.float32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the scalar inverse square root (1/sqrt(x)) example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the scalar inverse square root (1/sqrt(x)) example"
     )
     parser.add_argument(
         "--n",
@@ -138,14 +106,6 @@ def herd_body(
         help="Total number of elements",
     )
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(
@@ -162,51 +122,34 @@ def herd_body(
     np.random.seed(37)
     input_a = np.random.uniform(0.1, 10.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices: 1.0 / sqrt(x)
-        sampled_values = np.array(
-            [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
-            dtype=INPUT_DATATYPE,
-        )
+    # Stochastically sample num_samples results and pass them to run_on_npu for verification.
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),  # i indices
+        ]
+    )
 
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
-        )
+    # Compute reference results for sampled indices: 1.0 / sqrt(x)
+    sampled_values = np.array(
+        [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
+        dtype=INPUT_DATATYPE,
+    )
 
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
+    # Store as a dictionary
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="scalar_invsqrt",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py b/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py
index ed936d96e..72ee100dc 100644
--- a/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py
+++ b/programming_examples/primitives/scalar_examples/scalar_reciprocal/scalar_reciprocal.py
@@ -1,17 +1,14 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -28,11 +25,7 @@ def build_module(n, tile_n, np_dtype_in):
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def scalar_reciprocal(arg0, arg1):
@@ -54,21 +47,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
@@ -119,19 +98,8 @@ def herd_body(
     TILE_N = 1024
     INPUT_DATATYPE = np.float32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the scalar reciprocal (1/x) example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the scalar reciprocal (1/x) example"
     )
     parser.add_argument(
         "--n",
@@ -140,14 +108,6 @@ def herd_body(
         help="Total number of elements",
     )
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(
@@ -164,51 +124,34 @@ def herd_body(
     # Use a safe range [1, 10] to avoid division by zero or very small numbers
     input_a = np.random.uniform(1.0, 10.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices: 1.0 / x
-        sampled_values = np.array(
-            [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]],
-            dtype=INPUT_DATATYPE,
-        )
+    # Stochastically sample num_samples results and pass them to run_on_npu for verification.
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),  # i indices
+        ]
+    )
 
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-5,
-            )
-        )
+    # Compute reference results for sampled indices: 1.0 / x
+    sampled_values = np.array(
+        [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]],
+        dtype=INPUT_DATATYPE,
+    )
 
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
+    # Store as a dictionary
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="scalar_reciprocal",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-5,
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py b/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py
index 5cff5dce4..a0b700409 100644
--- a/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py
+++ b/programming_examples/primitives/scalar_examples/scalar_shift_saturate/scalar_shift_saturate.py
@@ -17,19 +17,16 @@
 Uses a 1x2 AIE herd with DMA transfers between L3 and L1 memory.
 """
 
-import argparse
 import numpy as np
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -45,11 +42,7 @@ def build_module(n, tile_n, np_dtype, shift_amount=4):
     l3memrefTy = MemRefType.get([n], xrt_dtype)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype)
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def scalar_shift_saturate(arg0, arg1):
@@ -64,21 +57,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
             l1_out = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_in,
@@ -138,12 +117,9 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     SHIFT_AMOUNT = 4
     INPUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the scalar shift+saturate example",
+    parser = make_air_parser(
+        "Builds, runs, and tests the scalar shift+saturate example"
     )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -152,20 +128,6 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
         default=SHIFT_AMOUNT,
         help="Right shift amount (quantization scale factor)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
@@ -181,53 +143,37 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     max_val = (127 << args.shift_amount) + (1 << args.shift_amount)
     input_a = np.random.randint(-max_val, max_val, args.n, dtype=INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-
-        # Reference: SRS (Shift-Round-Saturate) with positive_inf rounding.
-        # AIECoreToStandard sets rounding mode 9 (positive_inf) for integer SRS,
-        # which rounds toward positive infinity at the midpoint.
-        def ref_shift_saturate(x, shift):
-            shifted = (x + (1 << (shift - 1))) >> shift
-            return np.clip(shifted, -128, 127).astype(np.int8).astype(np.int32)
-
-        sampled_values = np.array(
-            [
-                ref_shift_saturate(input_a[i], args.shift_amount)
-                for i in zip(*sampled_indices)
-            ],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
+
+    # Reference: SRS (Shift-Round-Saturate) with positive_inf rounding.
+    # AIECoreToStandard sets rounding mode 9 (positive_inf) for integer SRS,
+    # which rounds toward positive infinity at the midpoint.
+    def ref_shift_saturate(x, shift):
+        shifted = (x + (1 << (shift - 1))) >> shift
+        return np.clip(shifted, -128, 127).astype(np.int8).astype(np.int32)
+
+    sampled_values = np.array(
+        [
+            ref_shift_saturate(input_a[i], args.shift_amount)
+            for i in zip(*sampled_indices)
+        ],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="scalar_shift_saturate",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=0,
-                atol=0,
-            )
+            stochastic_expected_outputs=[sampled_data],
+            rtol=0,
+            atol=0,
         )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_add/vector_add.py b/programming_examples/primitives/vector_examples/vector_add/vector_add.py
index 0008f6aaf..f587978c8 100644
--- a/programming_examples/primitives/vector_examples/vector_add/vector_add.py
+++ b/programming_examples/primitives/vector_examples/vector_add/vector_add.py
@@ -1,18 +1,36 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -24,23 +42,16 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, vector_size=16):
     a_size = [n]
-    b_size = a_size
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_add(arg0, arg1, arg2):
@@ -63,97 +74,37 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 dma_memcpy_nd(
                     l1_b_data,
                     _l3_b,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_b_vec = subview(
-                        l1_b_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
-                    v_b = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_b_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_c = arith.AddFOp(v_a, v_b)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -165,26 +116,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--n",
         type=int,
@@ -198,22 +135,6 @@ def herd_body(
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
     parser.add_argument(
         "--bf16-emulation",
         dest="bf16_emulation",
@@ -234,65 +155,24 @@ def herd_body(
         INPUT_DATATYPE,
         args.vector_size,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.arange(0, args.n, dtype=np.int64)
     input_a = input_a.astype(INPUT_DATATYPE)
     input_b = np.arange(0, args.n, dtype=np.int64)
     input_b = input_b.astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] + input_b[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_a, input_b], args.n, lambda a, b: a + b, INPUT_DATATYPE
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b],
             instance_name="vector_add",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=5e-2 if bf16_emulation else 1e-3,
             bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=5e-2 if bf16_emulation else 1e-3,
-            )
         )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py b/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py
index 715225240..447494a26 100644
--- a/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py
+++ b/programming_examples/primitives/vector_examples/vector_broadcast_scalar/vector_broadcast_scalar.py
@@ -1,10 +1,18 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview, collapse_shape
@@ -19,8 +27,13 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.math import exp
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    tiled_1d_offset,
+    make_air_parser,
+    run_on_npu,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -44,16 +57,8 @@ def build_module(m, n, tile_m, np_dtype_in):
     l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_m, 1],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-    l1outputMemrefTy = MemRefType.get(
-        shape=[tile_m, n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_m, 1], xrt_dtype_in)
+    l1outputMemrefTy = l1_memref_type([tile_m, n], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy)
     def vector_broadcast_scalar(arg0, arg2):
@@ -74,28 +79,12 @@ def herd_body(
             l1_out_data = AllocOp(l1outputMemrefTy, [], [])
 
             for _l_ivx in range_(0, m, tile_m * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_m),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_m)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_m],
                     src_strides=[1],
                 )
@@ -143,24 +132,15 @@ def herd_body(
                     )
                     cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                     scalar = load(collapse_a, [c0])
-                    # v_a = transfer_read(
-                    #     VectorType.get([n], xrt_dtype_in),
-                    #     collapse_a,
-                    #     [c0],
-                    #     AffineMapAttr.get(AffineMap.get_identity(1)),
-                    #     cst0,
-                    #     [True],
-                    # )
 
                     v_c_broadcast = broadcast(VectorType.get([n], xrt_dtype_in), scalar)
-                    # store(v_c_broadcast, collapse_c, [c0])
 
                     transfer_write(
                         None,
                         v_c_broadcast,
                         collapse_c,
                         [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
+                        identity_map_attr(),
                         [True],
                     )
                     yield_([])
@@ -179,26 +159,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     M = 65536
     N = 16
     TILE_M = 256
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--m",
         type=int,
@@ -212,22 +178,6 @@ def herd_body(
         help="Output size (dimension N, the broadcasted dimension)",
     )
     parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -237,55 +187,33 @@ def herd_body(
         args.tile_m,
         INPUT_DATATYPE,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.arange(0, (args.m), dtype=INPUT_DATATYPE).reshape(args.m, 1)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [np.broadcast_to(input_a[i], (args.n,)) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
+    # Stochastically sample num_samples results for verification.
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
 
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.m, args.n),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [np.broadcast_to(input_a[i], (args.n,)) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
 
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    # Store as a dictionary
+    sampled_data = {
+        "shape": (args.m, args.n),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_broadcast_scalar",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_div/vector_div.py b/programming_examples/primitives/vector_examples/vector_div/vector_div.py
index cec991145..129bf035c 100644
--- a/programming_examples/primitives/vector_examples/vector_div/vector_div.py
+++ b/programming_examples/primitives/vector_examples/vector_div/vector_div.py
@@ -1,18 +1,36 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-from ml_dtypes import bfloat16
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
+
+import numpy as np
 
 range_ = for_
 
@@ -20,8 +38,6 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     a_size = [n]
-    b_size = a_size
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
@@ -33,15 +49,10 @@ def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     VECTOR_SIZE = arch_vector_sizes.get(arch, 16)  # default to 16 if unknown
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_div(arg0, arg1, arg2):
@@ -65,97 +76,37 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 dma_memcpy_nd(
                     l1_b_data,
                     _l3_b,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_b_vec = subview(
-                        l1_b_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
-                    v_b = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_b_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_c = arith.DivFOp(v_a, v_b)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -167,25 +118,11 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     INPUT_DATATYPE = np.float32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector division example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the vector division example")
     parser.add_argument(
         "--n",
         type=int,
@@ -200,22 +137,6 @@ def herd_body(
         default="aie2",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -225,9 +146,7 @@ def herd_body(
         INPUT_DATATYPE,
         args.arch,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     # Generate random input vectors with fixed seed for reproducibility
     np.random.seed(37)
@@ -235,54 +154,16 @@ def herd_body(
     input_a = np.random.uniform(0.1, 10.0, args.n).astype(INPUT_DATATYPE)
     input_b = np.random.uniform(1.0, 10.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] / input_b[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_a, input_b], args.n, lambda a, b: a / b, INPUT_DATATYPE
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b],
             instance_name="vector_div",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py b/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py
index 51463700f..b24a54793 100644
--- a/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py
+++ b/programming_examples/primitives/vector_examples/vector_exp/vector_exp.py
@@ -1,19 +1,37 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.math import exp
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -25,22 +43,16 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, arch="aie2", vector_size=16):
     a_size = [n]
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def vector_exp(arg0, arg2):
@@ -66,74 +78,29 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_c = exp(v_a)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -144,26 +111,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the vector_exp example")
     parser.add_argument(
         "--n",
         type=int,
@@ -184,22 +137,6 @@ def herd_body(
         default="aie2",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -210,62 +147,21 @@ def herd_body(
         args.arch,
         args.vector_size,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     # Generate input values in a safe range for exp operation to avoid overflow
-    # Using values between -5 and 5 to ensure exp(x) stays within bfloat16 range
     input_a = np.random.uniform(-5, 5, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [np.exp(input_a[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_a], args.n, lambda x: np.exp(x), INPUT_DATATYPE
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_exp",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py b/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py
index 660294b39..59f2bb133 100644
--- a/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py
+++ b/programming_examples/primitives/vector_examples/vector_fma/vector_fma.py
@@ -1,19 +1,38 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp, fma
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp, fma
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -30,18 +49,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16):
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_fma(arg0, arg1, arg2):
@@ -67,21 +78,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_b_data,
@@ -108,16 +105,12 @@ def herd_body(
                 v_a = BroadcastOp(vecTy, a_const)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_b = subview(l1_b_data.result, [j], [VECTOR_SIZE], [1])
-                    sub_c = subview(l1_c_data.result, [j], [VECTOR_SIZE], [1])
-                    sub_out = subview(l1_out_data.result, [j], [VECTOR_SIZE], [1])
-
-                    v_b = transfer_read(vecTy, sub_b, [c0], identity_map, cst0, [True])
-                    v_c = transfer_read(vecTy, sub_c, [c0], identity_map, cst0, [True])
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_c = vec_read(l1_c_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
 
                     # alpha * b + c via vector.fma
                     v_result = fma(v_a, v_b, v_c)
-                    transfer_write(None, v_result, sub_out, [c0], identity_map, [True])
+                    vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -141,12 +134,7 @@ def herd_body(
     INPUT_DATATYPE = bfloat16
     ALPHA = 2.0
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector_fma example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the vector_fma example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -158,20 +146,6 @@ def herd_body(
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     parser.add_argument(
         "--bf16-emulation",
         dest="bf16_emulation",
@@ -189,51 +163,26 @@ def herd_body(
     mlir_module = build_module(
         args.n, args.tile_n, INPUT_DATATYPE, args.alpha, args.vector_size
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_b = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE)
     input_c = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-        sampled_values = np.array(
-            [args.alpha * input_b[i] + input_c[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_b, input_c],
+        args.n,
+        lambda b, c: args.alpha * b + c,
+        INPUT_DATATYPE,
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_b, input_c],
             instance_name="vector_fma",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=2e-1 if bf16_emulation else 1e-2,
+            atol=5e-2 if bf16_emulation else 1e-8,
             bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_b, input_c],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=2e-1 if bf16_emulation else 1e-2,
-                atol=5e-2 if bf16_emulation else 1e-8,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_max/vector_max.py b/programming_examples/primitives/vector_examples/vector_max/vector_max.py
index d4a622cd2..067253e65 100644
--- a/programming_examples/primitives/vector_examples/vector_max/vector_max.py
+++ b/programming_examples/primitives/vector_examples/vector_max/vector_max.py
@@ -1,18 +1,36 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp, MaximumFOp
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -24,23 +42,16 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, vector_size=16):
     a_size = [n]
-    b_size = a_size
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_max(arg0, arg1, arg2):
@@ -63,97 +74,37 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 dma_memcpy_nd(
                     l1_b_data,
                     _l3_b,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_b_vec = subview(
-                        l1_b_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
-                    v_b = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_b_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_c = MaximumFOp(v_a, v_b)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -165,26 +116,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector_max example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the vector_max example")
     parser.add_argument(
         "--n",
         type=int,
@@ -198,22 +135,6 @@ def herd_body(
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
     parser.add_argument(
         "--bf16-emulation",
         dest="bf16_emulation",
@@ -234,63 +155,25 @@ def herd_body(
         INPUT_DATATYPE,
         args.vector_size,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE)
     input_b = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [max(input_a[i], input_b[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_a, input_b],
+        args.n,
+        lambda a, b: max(a, b),
+        INPUT_DATATYPE,
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b],
             instance_name="vector_max",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=5e-2 if bf16_emulation else 1e-3,
             bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=5e-2 if bf16_emulation else 1e-3,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py b/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py
index 67bfa2248..0b8e44851 100644
--- a/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py
+++ b/programming_examples/primitives/vector_examples/vector_mul/vector_mul.py
@@ -1,18 +1,36 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -24,8 +42,6 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     a_size = [n]
-    b_size = a_size
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
@@ -37,15 +53,10 @@ def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     VECTOR_SIZE = arch_vector_sizes.get(arch, 16)  # default to 16 if unknown
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_mul(arg0, arg1, arg2):
@@ -68,97 +79,37 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 dma_memcpy_nd(
                     l1_b_data,
                     _l3_b,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_b_vec = subview(
-                        l1_b_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
-                    v_b = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_b_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_c = arith.MulFOp(v_a, v_b)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -170,25 +121,11 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the vector_mul example")
     parser.add_argument(
         "--n",
         type=int,
@@ -203,22 +140,6 @@ def herd_body(
         default="aie2",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
     parser.add_argument(
         "--bf16-emulation",
         dest="bf16_emulation",
@@ -239,65 +160,24 @@ def herd_body(
         INPUT_DATATYPE,
         args.arch,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.arange(0, args.n, dtype=np.int64).reshape(args.n)
     input_a = input_a.astype(INPUT_DATATYPE)
     input_b = np.arange(0, args.n, dtype=np.int64).reshape(args.n)
     input_b = input_b.astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] * input_b[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_a, input_b], args.n, lambda a, b: a * b, INPUT_DATATYPE
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b],
             instance_name="vector_mul",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=5e-2 if bf16_emulation else 1e-2,
             bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=5e-2 if bf16_emulation else 1e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py b/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py
index 73dd547ad..27f40e815 100644
--- a/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py
+++ b/programming_examples/primitives/vector_examples/vector_muladd/vector_muladd.py
@@ -1,19 +1,38 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp, mulf, addf
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -30,18 +49,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16):
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_muladd(arg0, arg1, arg2):
@@ -67,21 +78,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_b_data,
@@ -108,18 +105,14 @@ def herd_body(
                 v_a = BroadcastOp(vecTy, a_const).result
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_b = subview(l1_b_data.result, [j], [VECTOR_SIZE], [1])
-                    sub_c = subview(l1_c_data.result, [j], [VECTOR_SIZE], [1])
-                    sub_out = subview(l1_out_data.result, [j], [VECTOR_SIZE], [1])
-
-                    v_b = transfer_read(vecTy, sub_b, [c0], identity_map, cst0, [True])
-                    v_c = transfer_read(vecTy, sub_c, [c0], identity_map, cst0, [True])
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_c = vec_read(l1_c_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
 
                     # alpha * b + c via separate arith.mulf + arith.addf
                     # The aievec pass fuses this into aievec.mac_elem (PR #2896)
                     v_ab = mulf(v_a, v_b)
                     v_result = addf(v_ab, v_c)
-                    transfer_write(None, v_result, sub_out, [c0], identity_map, [True])
+                    vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -143,12 +136,7 @@ def herd_body(
     INPUT_DATATYPE = bfloat16
     ALPHA = 2.0
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector_muladd example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the vector_muladd example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -160,68 +148,30 @@ def herd_body(
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
     mlir_module = build_module(
         args.n, args.tile_n, INPUT_DATATYPE, args.alpha, args.vector_size
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_b = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE)
     input_c = np.random.uniform(-10.0, 10.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-        sampled_values = np.array(
-            [args.alpha * input_b[i] + input_c[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_b, input_c],
+        args.n,
+        lambda b, c: args.alpha * b + c,
+        INPUT_DATATYPE,
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_b, input_c],
             instance_name="vector_muladd",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_b, input_c],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py b/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py
index 000bcbd6d..0edc7de27 100644
--- a/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py
+++ b/programming_examples/primitives/vector_examples/vector_reciprocal/vector_reciprocal.py
@@ -1,18 +1,36 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 import numpy as np
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, broadcast
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import broadcast
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    check_print_module,
+)
 
 range_ = for_
 
@@ -20,7 +38,6 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     a_size = [n]
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
@@ -32,15 +49,10 @@ def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     VECTOR_SIZE = arch_vector_sizes.get(arch, 16)  # default to 16 if unknown
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def vector_reciprocal(arg0, arg1):
@@ -62,28 +74,12 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
@@ -94,51 +90,20 @@ def herd_body(
 
                 # Create constant 1.0 scalar and broadcast to vector
                 one_scalar = arith.ConstantOp(xrt_dtype_in, 1.0)
-                one_vector = broadcast(
-                    VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                    one_scalar,
-                )
+                one_vector = broadcast(vecTy, one_scalar)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
                     cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     # Compute reciprocal: 1.0 / a
                     v_c = arith.DivFOp(one_vector, v_a)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -149,24 +114,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     INPUT_DATATYPE = np.float32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector reciprocal (1/x) example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the vector reciprocal (1/x) example"
     )
     parser.add_argument(
         "--n",
@@ -182,14 +135,6 @@ def herd_body(
         default="aie2",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(
@@ -198,60 +143,36 @@ def herd_body(
         INPUT_DATATYPE,
         args.arch,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     # Generate random input vector with fixed seed for reproducibility
     np.random.seed(37)
     # Use a safe range [1, 10] to avoid division by zero
     input_a = np.random.uniform(1.0, 10.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices: 1.0 / x
-        sampled_values = np.array(
-            [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-5,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),
+        ]
+    )
+    # Compute reference results for sampled indices: 1.0 / x
+    sampled_values = np.array(
+        [np.float32(1.0) / np.float32(input_a[i]) for i in sampled_indices[0]],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="vector_reciprocal",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-5,
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py b/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py
index 1f6d2862f..03bfd430a 100644
--- a/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py
+++ b/programming_examples/primitives/vector_examples/vector_reduce_add/vector_reduce_add.py
@@ -1,10 +1,18 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview, collapse_shape
@@ -19,8 +27,17 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.math import exp
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from air.dialects import arith
+from utils import (
+    make_l1_memref,
+    identity_map_1d,
+    tiled_1d_offset,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -43,16 +60,8 @@ def build_module(m, n, tile_m, np_dtype_in):
     l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_m, n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-    l1outputMemrefTy = MemRefType.get(
-        shape=[tile_m, 1],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_m, n], xrt_dtype_in)
+    l1outputMemrefTy = make_l1_memref([tile_m, 1], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy)
     def vector_reduce_add(arg0, arg2):
@@ -73,21 +82,7 @@ def herd_body(
             l1_out_data = AllocOp(l1outputMemrefTy, [], [])
 
             for _l_ivx in range_(0, m, tile_m * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_m),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_m)
 
                 dma_memcpy_nd(
                     l1_a_data,
@@ -142,7 +137,7 @@ def herd_body(
                         VectorType.get([n], xrt_dtype_in),
                         collapse_a,
                         [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
+                        identity_map_1d(),
                         cst0,
                         [True],
                     )
@@ -166,26 +161,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     M = 65536
     N = 16
     TILE_M = 256
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--m",
         type=int,
@@ -199,22 +180,6 @@ def herd_body(
         help="Input size (dimension N)",
     )
     parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -224,58 +189,36 @@ def herd_body(
         args.tile_m,
         INPUT_DATATYPE,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.arange(0, (args.m * args.n), dtype=INPUT_DATATYPE).reshape(
         args.m, args.n
     )
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [np.sum(input_a[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
+    # Stochastically sample num_samples results for verification.
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
 
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.m,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [np.sum(input_a[i]) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
 
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    # Store as a dictionary
+    sampled_data = {
+        "shape": (args.m,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_reduce_add",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py b/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py
index 3a1589443..c02bd64dd 100644
--- a/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py
+++ b/programming_examples/primitives/vector_examples/vector_reduce_max/vector_reduce_max.py
@@ -1,10 +1,18 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview, collapse_shape
@@ -19,8 +27,14 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects.math import exp
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    tiled_1d_offset,
+    make_air_parser,
+    run_on_npu,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -44,16 +58,8 @@ def build_module(m, n, tile_m, np_dtype_in):
     l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_m, n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-    l1outputMemrefTy = MemRefType.get(
-        shape=[tile_m, 1],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_m, n], xrt_dtype_in)
+    l1outputMemrefTy = make_l1_memref([tile_m, 1], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy)
     def vector_reduce_max(arg0, arg2):
@@ -74,21 +80,7 @@ def herd_body(
             l1_out_data = AllocOp(l1outputMemrefTy, [], [])
 
             for _l_ivx in range_(0, m, tile_m * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_m),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_m)
 
                 dma_memcpy_nd(
                     l1_a_data,
@@ -144,7 +136,7 @@ def herd_body(
                         VectorType.get([n], xrt_dtype_in),
                         collapse_a,
                         [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
+                        identity_map_attr(),
                         cst0,
                         [True],
                     )
@@ -168,26 +160,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     M = 65536
     N = 32
     TILE_M = 256
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the passthrough_dma example")
     parser.add_argument(
         "--m",
         type=int,
@@ -201,22 +179,6 @@ def herd_body(
         help="Input size (dimension N)",
     )
     parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M")
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -226,58 +188,36 @@ def herd_body(
         args.tile_m,
         INPUT_DATATYPE,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.arange(0, (args.m * args.n), dtype=INPUT_DATATYPE).reshape(
         args.m, args.n
     )
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [np.max(input_a[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
+    # Stochastically sample num_samples results for verification.
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])
 
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.m,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [np.max(input_a[i]) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
 
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    # Store as a dictionary
+    sampled_data = {
+        "shape": (args.m,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_reduce_max",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py
index 8c0bd638f..26ce44e4c 100644
--- a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py
+++ b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v1.py
@@ -1,21 +1,39 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 # Version 1: f32 vector rsqrt
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.math import rsqrt
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 range_ = for_
 
@@ -23,22 +41,16 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     a_size = [n]
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = 16
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def vector_rsqrt(arg0, arg2):
@@ -64,75 +76,29 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_c = rsqrt(v_a)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -143,23 +109,11 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 512
     TILE_N = 64
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector_rsqrt example (Version 1: f32 vector)",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the vector_rsqrt example (Version 1: f32 vector)"
     )
     parser.add_argument(
         "--n",
@@ -175,22 +129,6 @@ def herd_body(
         default="aie2",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -208,63 +146,34 @@ def herd_body(
         INPUT_DATATYPE,
         args.arch,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     # Generate input values in range [0.1, 3.0] to match working testbench pattern
-    # This ensures positive values (required for rsqrt) and stays well within bfloat16 range
     np.random.seed(10)
     input_a = np.abs(np.random.uniform(0.1, 3.0, args.n)).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),
+        ]
+    )
+    sampled_values = np.array(
+        [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_rsqrt",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py
index 02601fa95..9f54a7dfe 100644
--- a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py
+++ b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v2.py
@@ -1,20 +1,34 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 # Version 2: f32 scalar rsqrt in loop
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
+import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
 from air.dialects.func import FuncOp
 from air.dialects.math import rsqrt
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    tiled_1d_offset,
+    make_air_parser,
+    run_on_npu,
+    check_print_module,
+)
 
 range_ = for_
 
@@ -22,22 +36,14 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     a_size = [n]
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = 16
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def vector_rsqrt(arg0, arg2):
@@ -63,28 +69,12 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
@@ -121,9 +111,7 @@ def herd_body(
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -134,24 +122,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 512
     TILE_N = 64
     INPUT_DATATYPE = np.float32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector_rsqrt example (Version 2: f32 scalar in loop)",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the vector_rsqrt example (Version 2: f32 scalar in loop)"
     )
     parser.add_argument(
         "--n",
@@ -167,22 +143,6 @@ def herd_body(
         default="aie2",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -200,63 +160,34 @@ def herd_body(
         INPUT_DATATYPE,
         args.arch,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     # Generate input values in range [0.1, 3.0] to match working testbench pattern
-    # This ensures positive values (required for rsqrt) and stays well within bfloat16 range
     np.random.seed(10)
     input_a = np.abs(np.random.uniform(0.1, 3.0, args.n)).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),
+        ]
+    )
+    sampled_values = np.array(
+        [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_rsqrt",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py
index f15f81325..bf5ff4cbc 100644
--- a/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py
+++ b/programming_examples/primitives/vector_examples/vector_rsqrt/vector_rsqrt_v3.py
@@ -1,20 +1,38 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 # Version 3: bf16 vector rsqrt with f32 conversion
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
+import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp, extf, truncf
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.math import rsqrt
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    check_print_module,
+)
 
 range_ = for_
 
@@ -22,22 +40,16 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, arch="aie2"):
     a_size = [n]
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = 16
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def vector_rsqrt(arg0, arg2):
@@ -63,58 +75,23 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-
                     # Load bf16 vector
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a_bf16 = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
+                    v_a_bf16 = vec_read(
+                        l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap
                     )
 
                     # Extend bf16 to f32
@@ -133,22 +110,13 @@ def herd_body(
                     )
 
                     # Store bf16 vector
-                    transfer_write(
-                        None,
-                        v_c_bf16,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c_bf16, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -159,24 +127,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 512
     TILE_N = 64
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector_rsqrt example (Version 3: bf16 vector with f32 conversion)",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
+    parser = make_air_parser(
+        "Builds, runs, and tests the vector_rsqrt example (Version 3: bf16 vector with f32 conversion)"
     )
     parser.add_argument(
         "--n",
@@ -192,22 +148,6 @@ def herd_body(
         default="aie2",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -217,63 +157,34 @@ def herd_body(
         INPUT_DATATYPE,
         args.arch,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
-    # Generate input values in range [0.1, 3.0] to match working testbench pattern
-    # This ensures positive values (required for rsqrt) and stays well within bfloat16 range
+    # Generate strictly positive inputs in [0.1, 3.0]: rsqrt needs x > 0
     np.random.seed(10)
     input_a = np.abs(np.random.uniform(0.1, 3.0, args.n)).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, args.n, num_samples),
+        ]
+    )
+    sampled_values = np.array(
+        [1.0 / np.sqrt(input_a[i]) for i in sampled_indices[0]],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_rsqrt",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-            )
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
         )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_select/vector_select.py b/programming_examples/primitives/vector_examples/vector_select/vector_select.py
index 6f7deb79a..ea6e73067 100644
--- a/programming_examples/primitives/vector_examples/vector_select/vector_select.py
+++ b/programming_examples/primitives/vector_examples/vector_select/vector_select.py
@@ -1,18 +1,36 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp, cmpf, select, CmpFPredicate
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -24,23 +42,16 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, vector_size=16):
     a_size = [n]
-    b_size = a_size
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_select(arg0, arg1, arg2):
@@ -63,100 +74,40 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 dma_memcpy_nd(
                     l1_b_data,
                     _l3_b,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_b_vec = subview(
-                        l1_b_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
-                    v_b = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_b_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     # Compare: a >= b (ordered greater-or-equal)
                     cmp_result = cmpf(CmpFPredicate.OGE, v_a, v_b)
                     # Select: when cmp is true (a >= b), pick a; otherwise pick b
                     v_c = select(cmp_result, v_a, v_b)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -168,26 +119,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vector_select example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the vector_select example")
     parser.add_argument(
         "--n",
         type=int,
@@ -201,22 +138,6 @@ def herd_body(
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
     parser.add_argument(
         "--bf16-emulation",
         dest="bf16_emulation",
@@ -237,63 +158,25 @@ def herd_body(
         INPUT_DATATYPE,
         args.vector_size,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE)
     input_b = np.random.uniform(-100.0, 100.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [max(input_a[i], input_b[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_a, input_b],
+        args.n,
+        lambda a, b: max(a, b),
+        INPUT_DATATYPE,
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b],
             instance_name="vector_select",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=5e-2 if bf16_emulation else 1e-3,
             bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=5e-2 if bf16_emulation else 1e-3,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
         )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py b/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py
index 9403059e8..b1ae243db 100644
--- a/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py
+++ b/programming_examples/primitives/vector_examples/vector_sub/vector_sub.py
@@ -1,18 +1,36 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 import numpy as np
 
@@ -24,23 +42,16 @@
 @module_builder
 def build_module(n, tile_n, np_dtype_in, vector_size=16):
     a_size = [n]
-    b_size = a_size
-    out_size = a_size
     xrt_dtype_in = type_mapper(np_dtype_in)
     num_tiles = 2
     assert n % (tile_n * num_tiles) == 0
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
     def vector_sub(arg0, arg1, arg2):
@@ -63,97 +74,37 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
                     _l3_a,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 dma_memcpy_nd(
                     l1_b_data,
                     _l3_b,
-                    src_offsets=[
-                        offset,
-                    ],
+                    src_offsets=[offset],
                     src_sizes=[tile_n],
                     src_strides=[1],
                 )
                 c0 = ConstantOp(index_type, 0)
-                c1 = ConstantOp(index_type, 1)
                 cVecSize = ConstantOp(index_type, VECTOR_SIZE)
                 cTileN = ConstantOp(index_type, tile_n)
+                cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_a_vec = subview(
-                        l1_a_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_b_vec = subview(
-                        l1_b_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_c_vec = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
-                    v_a = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_a_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
-                    v_b = transfer_read(
-                        VectorType.get([VECTOR_SIZE], xrt_dtype_in),
-                        sub_b_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        cst0,
-                        [True],
-                    )
+                    v_a = vec_read(l1_a_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
+                    v_b = vec_read(l1_b_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     v_c = arith.SubFOp(v_a, v_b)
-                    transfer_write(
-                        None,
-                        v_c,
-                        sub_c_vec,
-                        [c0],
-                        AffineMapAttr.get(AffineMap.get_identity(1)),
-                        [True],
-                    )
+                    vec_write(v_c, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
                     _l3_c,
                     l1_out_data,
-                    dst_offsets=[
-                        offset,
-                    ],
+                    dst_offsets=[offset],
                     dst_sizes=[tile_n],
                     dst_strides=[1],
                 )
@@ -165,26 +116,12 @@ def herd_body(
 
 
 if __name__ == "__main__":
-    # Default values.
     N = 65536
     TILE_N = 1024
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the vector_sub example")
     parser.add_argument(
         "--n",
         type=int,
@@ -198,22 +135,6 @@ def herd_body(
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
     parser.add_argument(
         "--bf16-emulation",
         dest="bf16_emulation",
@@ -234,65 +155,24 @@ def herd_body(
         INPUT_DATATYPE,
         args.vector_size,
     )
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     input_a = np.arange(0, args.n, dtype=np.int64)
     input_a = input_a.astype(INPUT_DATATYPE)
     input_b = np.arange(0, args.n, dtype=np.int64)
     input_b = input_b.astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-
-        # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-        num_samples = 100
-        sampled_indices = np.vstack(
-            [
-                np.random.randint(0, args.n, num_samples),  # i indices
-            ]
-        )
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] - input_b[i] for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check(
+        [input_a, input_b], args.n, lambda a, b: a - b, INPUT_DATATYPE
+    )
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a, input_b],
             instance_name="vector_sub",
+            stochastic_expected_outputs=[sampled_data],
+            rtol=5e-2 if bf16_emulation else 1e-3,
             bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=5e-2 if bf16_emulation else 1e-3,
-            )
         )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            bf16_emulation=bf16_emulation,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py b/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py
index d5fae8bda..f3435b0db 100644
--- a/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py
+++ b/programming_examples/primitives/vector_examples/vector_tanh/vector_tanh.py
@@ -12,21 +12,39 @@
 Computation is vectorized using vector.transfer_read/write.
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(
+    0,
+    os.path.dirname(
+        os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    ),
+)
+
 import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith, math as math_dialect
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper
+from utils import (
+    make_l1_memref,
+    make_vec_type,
+    identity_map_1d,
+    tiled_1d_offset,
+    vec_read,
+    vec_write,
+    make_air_parser,
+    run_on_npu,
+    stochastic_check,
+    check_print_module,
+)
 
 range_ = for_
 
@@ -41,14 +59,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16):
     index_type = IndexType.get()
 
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = make_l1_memref([tile_n], xrt_dtype_in)
+    vecTy = make_vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_1d()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def vector_tanh(arg0, arg1):
@@ -59,20 +72,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
             l1_out = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tiled_1d_offset(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_in,
@@ -88,17 +88,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
                 cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1])
-                    sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1])
-
-                    v_in = transfer_read(
-                        vecTy, sub_in, [c0], identity_map, cst0, [True]
-                    )
+                    v_in = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
 
                     # Hardware tanh intrinsic on AIE2P
                     v_out = math_dialect.tanh(v_in)
 
-                    transfer_write(None, v_out, sub_out, [c0], identity_map, [True])
+                    vec_write(v_out, l1_out, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -119,12 +114,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the vectorized tanh example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the vectorized tanh example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -140,69 +130,28 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
         default="aie2p",
         help="Target AIE architecture (aie2 or aie2p)",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
     mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size)
-    if args.print_module_only:
-        print(mlir_module)
-        exit(0)
+    check_print_module(mlir_module, args)
 
     np.random.seed(42)
     input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
+    # Reference: compute tanh in f32 precision
+    def tanh_ref(x):
+        return np.tanh(x.astype(np.float32))
 
-        # Reference: compute tanh in f32 precision
-        sampled_values = np.array(
-            [np.tanh(input_a[i].astype(np.float32)) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_data = stochastic_check([input_a], args.n, tanh_ref, INPUT_DATATYPE)
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="vector_tanh",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
+            atol=5e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-                atol=5e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/relu/relu.py b/programming_examples/relu/relu.py
index 69b23c301..7a4727167 100644
--- a/programming_examples/relu/relu.py
+++ b/programming_examples/relu/relu.py
@@ -11,23 +11,26 @@
 configurable VECTOR_SIZE (default 16).
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import numpy as np
 
 np.random.seed(42)
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
+from utils import vec_read, vec_write
 
 range_ = for_
 
@@ -41,18 +44,10 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16):
     VECTOR_SIZE = vector_size
     index_type = IndexType.get()
 
-    # L3 MemRefTypes
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-
-    # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def relu(arg0, arg1):
@@ -73,21 +68,7 @@ def herd_body(
             l1_out_data = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_in_data,
@@ -104,24 +85,10 @@ def herd_body(
                 v_zero = BroadcastOp(vecTy, cst0)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_in = subview(
-                        l1_in_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    sub_out = subview(
-                        l1_out_data.result,
-                        [j],
-                        [VECTOR_SIZE],
-                        [1],
-                    )
-                    v_in = transfer_read(
-                        vecTy, sub_in, [c0], identity_map, cst0, [True]
-                    )
+                    v_in = vec_read(l1_in_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
                     # RELU: max(x, 0) using arith.maximumf on bf16
                     v_relu = arith.MaximumFOp(v_in, v_zero)
-                    transfer_write(None, v_relu, sub_out, [c0], identity_map, [True])
+                    vec_write(v_relu, l1_out_data, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -142,12 +109,7 @@ def herd_body(
     TILE_N = 1024
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the RELU example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the RELU example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
@@ -156,20 +118,6 @@ def herd_body(
         default=16,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
 
     args = parser.parse_args()
 
@@ -181,41 +129,24 @@ def herd_body(
     # Mix of positive and negative values for RELU testing
     input_a = np.random.randn(args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-        sampled_values = np.array(
-            [np.maximum(input_a[i], 0) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
+    sampled_values = np.array(
+        [np.maximum(input_a[i], 0) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="relu",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/rms_norm/rms_norm.py b/programming_examples/rms_norm/rms_norm.py
index 96fab5328..4e52dfaf2 100644
--- a/programming_examples/rms_norm/rms_norm.py
+++ b/programming_examples/rms_norm/rms_norm.py
@@ -13,7 +13,7 @@
 configurable VECTOR_SIZE (default 16 for AIE2).
 """
 
-import argparse
+import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
@@ -28,8 +28,7 @@
 )
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -43,16 +42,15 @@ def build_module(M, N, np_dtype, vector_size=16):
         N % vector_size == 0
     ), f"N ({N}) must be divisible by vector_size ({vector_size})"
 
-    vecTy = VectorType.get([vector_size], xrt_dtype)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    vecTy = vec_type(vector_size, xrt_dtype)
+    identity_map = identity_map_attr()
 
     # L3 types
     l3MemrefTy = MemRefType.get([M, N], xrt_dtype)
 
     # L1 types
-    l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    l1RowTy = MemRefType.get([N], xrt_dtype, memory_space=l1_mem_space)
-    l1VecTy = MemRefType.get([vector_size], xrt_dtype, memory_space=l1_mem_space)
+    l1RowTy = l1_memref_type([N], xrt_dtype)
+    l1VecTy = l1_memref_type([vector_size], xrt_dtype)
 
     @FuncOp.from_py_func(l3MemrefTy, l3MemrefTy)
     def rms_norm(arg0, arg1):
@@ -155,12 +153,7 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out):
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the RMS normalization example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the RMS normalization example")
     parser.add_argument("--M", type=int, default=M_DEFAULT, help="M dimension (rows)")
     parser.add_argument("--N", type=int, default=N_DEFAULT, help="N dimension (cols)")
     parser.add_argument(
@@ -169,20 +162,6 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out):
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.M, args.N, INPUT_DATATYPE, args.vector_size)
@@ -200,30 +179,14 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_out):
     )
     y_expected = (x_input / rms).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[x_input],
             instance_name="rms_norm",
-            runtime_loop_tiling_sizes=[4, 4],
+            expected_outputs=[y_expected],
+            rtol=5e-2,
+            atol=5e-1,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[x_input],
-                expected_outputs=[y_expected],
-                rtol=5e-2,
-                atol=5e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/rope_lut/rope_lut.py b/programming_examples/rope_lut/rope_lut.py
index 04c90994b..eaa1d481f 100644
--- a/programming_examples/rope_lut/rope_lut.py
+++ b/programming_examples/rope_lut/rope_lut.py
@@ -16,7 +16,6 @@
 Uses a single AIE tile with DMA transfers between L3 and L1 memory.
 """
 
-import argparse
 import numpy as np
 from ml_dtypes import bfloat16
 
@@ -27,8 +26,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -110,30 +108,11 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_lut, l3_out):
     INPUT_DATATYPE = bfloat16
     THETA = 10000.0
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the RoPE (LUT-based) example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the RoPE (LUT-based) example")
     parser.add_argument("--seq-len", type=int, default=SEQ_LEN, help="Sequence length")
     parser.add_argument(
         "--embed-dim", type=int, default=EMBED_DIM, help="Embedding dimension"
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.seq_len, args.embed_dim, INPUT_DATATYPE)
@@ -160,44 +139,28 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_lut, l3_out):
             lut[r, 2 * i + 1] = np.sin(angle)
     lut = lut.astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        # Compute reference output
-        ref = np.copy(input_data).astype(np.float32)
-        input_f32 = input_data.astype(np.float32)
-        lut_f32 = lut.astype(np.float32)
-        for r in range(seq_len):
-            for i in range(embed_dim // 2):
-                cos_v = lut_f32[r, 2 * i]
-                sin_v = lut_f32[r, 2 * i + 1]
-                x0 = input_f32[r, 2 * i]
-                x1 = input_f32[r, 2 * i + 1]
-                ref[r, 2 * i] = x0 * cos_v - x1 * sin_v
-                ref[r, 2 * i + 1] = x0 * sin_v + x1 * cos_v
-        ref_flat = ref.flatten().astype(INPUT_DATATYPE)
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    # Compute reference output
+    ref = np.copy(input_data).astype(np.float32)
+    input_f32 = input_data.astype(np.float32)
+    lut_f32 = lut.astype(np.float32)
+    for r in range(seq_len):
+        for i in range(embed_dim // 2):
+            cos_v = lut_f32[r, 2 * i]
+            sin_v = lut_f32[r, 2 * i + 1]
+            x0 = input_f32[r, 2 * i]
+            x1 = input_f32[r, 2 * i + 1]
+            ref[r, 2 * i] = x0 * cos_v - x1 * sin_v
+            ref[r, 2 * i + 1] = x0 * sin_v + x1 * cos_v
+    ref_flat = ref.flatten().astype(INPUT_DATATYPE)
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_data.flatten(), lut.flatten()],
             instance_name="rope",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_data.flatten(), lut.flatten()],
-                expected_outputs=[ref_flat],
-                rtol=5e-2,
-                atol=5e-2,
-            )
+            expected_outputs=[ref_flat],
+            rtol=5e-2,
+            atol=5e-2,
         )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/rope_sincos/rope_sincos.py b/programming_examples/rope_sincos/rope_sincos.py
index a398a7651..4fcd9d48f 100644
--- a/programming_examples/rope_sincos/rope_sincos.py
+++ b/programming_examples/rope_sincos/rope_sincos.py
@@ -21,7 +21,6 @@
 XFAIL on Peano. See rope_lut/ for a Peano-compatible alternative.
 """
 
-import argparse
 import numpy as np
 from math import cos, sin
 from ml_dtypes import bfloat16
@@ -33,8 +32,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -193,12 +191,9 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     HERD_N = 4
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the RoPE (on-chip sin/cos) example",
+    parser = make_air_parser(
+        "Builds, runs, and tests the RoPE (on-chip sin/cos) example"
     )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
     parser.add_argument("--head-size", type=int, default=HEAD_SIZE, help="Head size")
     parser.add_argument(
         "--num-heads", type=int, default=NUM_HEADS, help="Number of heads"
@@ -209,20 +204,6 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
         default=HERD_N,
         help="Number of L1 tiles along the N dimension",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(
@@ -258,29 +239,13 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
             outputs[i][s + args.head_size] = v0 * fcr - v1 * fci
             outputs[i][s + args.head_size + 1] = v0 * fci + v1 * fcr
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[inputs],
             instance_name="rope",
-            runtime_loop_tiling_sizes=[4, 4],
+            expected_outputs=[outputs],
+            rtol=1e1,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[inputs],
-                expected_outputs=[outputs],
-                rtol=1e1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/segment_alloc/segment_alloc.py b/programming_examples/segment_alloc/segment_alloc.py
index 3ad90ff6b..3cea52a2a 100644
--- a/programming_examples/segment_alloc/segment_alloc.py
+++ b/programming_examples/segment_alloc/segment_alloc.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -8,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -42,15 +41,8 @@ def launch_body(a, b):
             # The arguments are still the input and the output
             @segment(name="seg", operands=[a, b])
             def segment_body(arg2, arg3):
-                # We want to store our data in L1 memory
-                mem_space_l2 = IntegerAttr.get(T.i32(), MemorySpace.L2)
-
                 # This is the type definition of the tile
-                tile_type_l2 = MemRefType.get(
-                    shape=TILE_SIZE,
-                    element_type=xrt_dtype,
-                    memory_space=mem_space_l2,
-                )
+                tile_type_l2 = l2_memref_type(TILE_SIZE, xrt_dtype)
 
                 # We must allocate a buffer of tile size for the input/output
                 tile_in_l2 = AllocOp(tile_type_l2, [], [])
@@ -60,15 +52,8 @@ def segment_body(arg2, arg3):
                 @herd(name="copyherd", sizes=[1, 1], operands=[arg2, arg3, tile_in_l2])
                 def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
 
-                    # We want to store our data in L1 memory
-                    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-
                     # This is the type definition of the tile
-                    tile_type_l1 = MemRefType.get(
-                        shape=TILE_SIZE,
-                        element_type=xrt_dtype,
-                        memory_space=mem_space_l1,
-                    )
+                    tile_type_l1 = l1_memref_type(TILE_SIZE, xrt_dtype)
 
                     # We must allocate a buffer of tile size for the input/output
                     tile_in_l1 = AllocOp(tile_type_l1, [], [])
@@ -114,29 +99,7 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the segment_alloc example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
-
+    parser = make_air_parser("Builds, runs, and tests the segment_alloc example")
     args = parser.parse_args()
 
     mlir_module = build_module()
@@ -150,10 +113,14 @@ def herd_body(tx, ty, sx, sy, a, b, my_l2_tile):
         for w in range(TILE_WIDTH):
             output_b[h, w] = input_a[h, w]
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    # Exit with the status returned by run_on_npu so failures reach the caller.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b],
+            instance_name="copy",
+            runtime_loop_tiling_sizes=[4, 4],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/segment_unroll/segment_unroll.py b/programming_examples/segment_unroll/segment_unroll.py
index 7eb5d2311..7d64f8046 100644
--- a/programming_examples/segment_unroll/segment_unroll.py
+++ b/programming_examples/segment_unroll/segment_unroll.py
@@ -13,7 +13,6 @@
 input data using channels indexed by segment coordinates.
 """
 
-import argparse
 import numpy as np
 
 from air.ir import *
@@ -22,7 +21,7 @@
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
 from air.dialects import arith
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -53,12 +52,7 @@ def build_module():
     memrefTyInOut = T.memref(VECTOR_LEN, xrt_dtype)
 
     # L1 memory space for tile data
-    mem_space_l1 = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    image_type_l1 = MemRefType.get(
-        shape=[VECTOR_LEN // SEGMENT_SIZE_X],
-        element_type=xrt_dtype,
-        memory_space=mem_space_l1,
-    )
+    image_type_l1 = l1_memref_type([VECTOR_LEN // SEGMENT_SIZE_X], xrt_dtype)
 
     # Define channels for data movement with dimensions matching segment unroll
     # Each unrolled segment instance needs its own channel endpoint
@@ -131,31 +125,7 @@ def herd_body(tx, ty, sx, sy, herd_seg_x, herd_seg_y):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="segment_unroll.py",
-        description="Builds, runs, and tests the segment unroll example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-        help="Enable verbose output",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-        help="Print the generated MLIR module and exit",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
-
+    parser = make_air_parser("Builds, runs, and tests the segment unroll example")
     args = parser.parse_args()
 
     mlir_module = build_module()
@@ -169,10 +139,14 @@ def herd_body(tx, ty, sx, sy, herd_seg_x, herd_seg_y):
     input_a = np.arange(VECTOR_LEN, dtype=INOUT_DATATYPE)
     output_b = input_a + 10
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="segment_unroll_test",
-        runtime_loop_tiling_sizes=[4, 4],
+    # Exit with the status returned by run_on_npu so failures reach the caller.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            expected_outputs=[output_b],
+            instance_name="segment_unroll_test",
+            runtime_loop_tiling_sizes=[4, 4],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/shared_l1/run.py b/programming_examples/shared_l1/run.py
index c29d6680c..eabd8c446 100644
--- a/programming_examples/shared_l1/run.py
+++ b/programming_examples/shared_l1/run.py
@@ -33,7 +33,7 @@
 from air.dialects.scf import for_, yield_
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from ml_dtypes import bfloat16
 
 # Constants for buffer sizes
@@ -72,6 +72,13 @@ def parse_args():
         dest="output_format",
         help="Output format for the compiled binary (default: xclbin)",
     )
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        choices=["compile-only", "compile-and-run"],
+        dest="compile_mode",
+        default="compile-and-run",
+    )
     args = parser.parse_args()
     return args
 
@@ -402,21 +409,16 @@ def main():
     A = np.random.rand(M_SIZE, N_SIZE).astype(bfloat16)
     C = (A + 3.0).astype(bfloat16)
 
-    # Run the module using XRTRunner
-    runner = XRTRunner(
-        omit_while_true_loop=False,
-        verbose=False,
-        runtime_loop_tiling_sizes=[1, 1],
-        output_format=args.output_format,
-        instance_name="func1",
-        debug_ir=True,
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[A],
+            instance_name="func1",
             expected_outputs=[C],
             rtol=1e-2,
+            runtime_loop_tiling_sizes=[1, 1],
+            debug_ir=True,
         )
     )
 
diff --git a/programming_examples/shim_dma_2d/run.py b/programming_examples/shim_dma_2d/run.py
index b10217e47..6ff41e880 100644
--- a/programming_examples/shim_dma_2d/run.py
+++ b/programming_examples/shim_dma_2d/run.py
@@ -3,46 +3,35 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-import argparse
 import numpy as np
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from shim_dma_2d import *
 
 INOUT_DATATYPE = np.int32
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the shim_dma_2d example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
+    parser = make_air_parser("Builds, runs, and tests the shim_dma_2d example")
     args = parser.parse_args()
 
     mlir_module = build_module()
 
+    if args.print_module_only:
+        print(mlir_module)
+        exit(0)
+
     input_a = np.arange(np.prod(IMAGE_SIZE), dtype=INOUT_DATATYPE).reshape(IMAGE_SIZE)
     output_b = np.zeros(shape=IMAGE_SIZE, dtype=INOUT_DATATYPE)
     for h in range(TILE_HEIGHT):
         for w in range(TILE_WIDTH):
             output_b[h, w] = input_a[h, w]
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        output_format=args.output_format,
-        instance_name="copy",
-        runtime_loop_tiling_sizes=[4, 4],
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
+            instance_name="copy",
+            expected_outputs=[output_b],
+        )
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
diff --git a/programming_examples/sigmoid/sigmoid.py b/programming_examples/sigmoid/sigmoid.py
index 54811f1ff..89b5b3948 100644
--- a/programming_examples/sigmoid/sigmoid.py
+++ b/programming_examples/sigmoid/sigmoid.py
@@ -14,21 +14,24 @@
 Computation is vectorized using vector.transfer_read/write.
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith, math as math_dialect
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
+from utils import vec_read, vec_write
 
 range_ = for_
 
@@ -43,14 +46,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16):
     index_type = IndexType.get()
 
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def sigmoid(arg0, arg1):
@@ -61,20 +59,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
             l1_out = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_in,
@@ -94,10 +79,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
                 v_one = BroadcastOp(vecTy, one_const)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1])
-                    sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1])
-
-                    v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True])
+                    v_x = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
 
                     # sigmoid(x) = 0.5 * (tanh(x/2) + 1)
                     # Uses hardware tanh intrinsic — no exp or division needed.
@@ -106,7 +88,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
                     v_tanh_plus_one = arith.addf(v_tanh, v_one.result)
                     v_sigmoid = arith.mulf(v_tanh_plus_one, v_half.result)
 
-                    transfer_write(None, v_sigmoid, sub_out, [c0], identity_map, [True])
+                    vec_write(v_sigmoid, l1_out, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -126,31 +108,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     TILE_N = 1024
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the Sigmoid example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the Sigmoid example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
         "--vector-size", type=int, default=16, help="Vector size for SIMD operations"
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size)
@@ -161,48 +124,30 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     np.random.seed(0)
     input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-
-        # Sigmoid reference using tanh-based identity (matches hardware computation)
-        def sigmoid_ref(x):
-            x_f32 = x.astype(np.float32)
-            return 0.5 * (np.tanh(x_f32 / 2.0) + 1.0)
+    # Sigmoid reference using tanh-based identity (matches hardware computation)
+    def sigmoid_ref(x):
+        x_f32 = x.astype(np.float32)
+        return 0.5 * (np.tanh(x_f32 / 2.0) + 1.0)
 
-        sampled_values = np.array(
-            [sigmoid_ref(input_a[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
+    sampled_values = np.array(
+        [sigmoid_ref(input_a[i]) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="sigmoid",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
+            atol=5e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-                atol=5e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/silu/silu.py b/programming_examples/silu/silu.py
index cf44bcfe6..f201b8864 100644
--- a/programming_examples/silu/silu.py
+++ b/programming_examples/silu/silu.py
@@ -14,21 +14,24 @@
 Computation is vectorized using vector.transfer_read/write.
 """
 
-import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import numpy as np
 from ml_dtypes import bfloat16
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects import arith, math as math_dialect
 from air.dialects.arith import ConstantOp
-from air.dialects.memref import AllocOp, DeallocOp, subview
-from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
+from air.dialects.memref import AllocOp, DeallocOp
+from air.dialects.vector import BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
+from utils import vec_read, vec_write
 
 range_ = for_
 
@@ -43,14 +46,9 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16):
     index_type = IndexType.get()
 
     l3memrefTy = MemRefType.get([n], xrt_dtype_in)
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
-
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    imap = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3memrefTy)
     def silu(arg0, arg1):
@@ -61,20 +59,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
             l1_out = AllocOp(l1MemrefTy, [], [])
 
             for _l_ivx in range_(0, n, tile_n * num_tiles):
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [_l_ivx, _ty])
+                offset = tile_offset_1d(_l_ivx, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_in,
@@ -94,10 +79,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
                 v_one = BroadcastOp(vecTy, one_const)
 
                 for j in range_(c0, cTileN, cVecSize):
-                    sub_in = subview(l1_in.result, [j], [VECTOR_SIZE], [1])
-                    sub_out = subview(l1_out.result, [j], [VECTOR_SIZE], [1])
-
-                    v_x = transfer_read(vecTy, sub_in, [c0], identity_map, cst0, [True])
+                    v_x = vec_read(l1_in, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
 
                     # SiLU(x) = x * sigmoid(x)
                     #         = x * 0.5 * (tanh(x/2) + 1)
@@ -108,7 +90,7 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
                     v_sigmoid = arith.mulf(v_tanh_plus_one, v_half.result)
                     v_silu = arith.mulf(v_x, v_sigmoid)
 
-                    transfer_write(None, v_silu, sub_out, [c0], identity_map, [True])
+                    vec_write(v_silu, l1_out, j, VECTOR_SIZE, c0, imap)
                     yield_([])
 
                 dma_memcpy_nd(
@@ -128,31 +110,12 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     TILE_N = 1024
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the SiLU example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the SiLU example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
         "--vector-size", type=int, default=16, help="Vector size for SIMD operations"
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size)
@@ -163,48 +126,30 @@ def herd_body(_tx, _ty, _sx, _sy, _l3_in, _l3_out):
     np.random.seed(0)
     input_a = np.random.uniform(-4.0, 4.0, args.n).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-
-        # SiLU reference using tanh-based sigmoid (matches hardware computation)
-        def silu_ref(x):
-            x_f32 = x.astype(np.float32)
-            return x_f32 * 0.5 * (np.tanh(x_f32 / 2.0) + 1.0)
+    # SiLU reference using tanh-based sigmoid (matches hardware computation)
+    def silu_ref(x):
+        x_f32 = x.astype(np.float32)
+        return x_f32 * 0.5 * (np.tanh(x_f32 / 2.0) + 1.0)
 
-        sampled_values = np.array(
-            [silu_ref(input_a[i]) for i in zip(*sampled_indices)],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
+    sampled_values = np.array(
+        [silu_ref(input_a[i]) for i in zip(*sampled_indices)],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_a],
             instance_name="silu",
-            runtime_loop_tiling_sizes=[4, 4],
+            stochastic_expected_outputs=[sampled_data],
+            rtol=1e-1,
+            atol=5e-2,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-                atol=5e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/programming_examples/sine_cosine/sine_cosine.py b/programming_examples/sine_cosine/sine_cosine.py
index 008028bf2..9de60847b 100644
--- a/programming_examples/sine_cosine/sine_cosine.py
+++ b/programming_examples/sine_cosine/sine_cosine.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 from math import cos, sin, sqrt, exp
 
 from air.ir import *
@@ -10,8 +9,7 @@
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from ml_dtypes import bfloat16
 
 import numpy as np
@@ -141,20 +139,7 @@ def herd_body(
     SIN_OR_COS = "sin"
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the sine_cosine example")
     parser.add_argument(
         "--n",
         type=int,
@@ -175,22 +160,6 @@ def herd_body(
         choices=["sin", "cos"],
         help="Sine or cosine mode (must be one of [sin, cos])",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -217,33 +186,13 @@ def herd_body(
         else:
             raise AssertionError
 
-    if args.compile_mode == "compile-and-run":
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[inputs],
             instance_name="sine_cosine",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[inputs],
-                expected_outputs=[outputs],
-                rtol=1e0,
-            )
+            expected_outputs=[outputs],
+            rtol=1e0,
         )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    )
diff --git a/programming_examples/softmax/softmax.py b/programming_examples/softmax/softmax.py
index 24c5daaf0..0383f4d17 100644
--- a/programming_examples/softmax/softmax.py
+++ b/programming_examples/softmax/softmax.py
@@ -1,17 +1,14 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 from math import cos, sin, sqrt, exp
 
 from air.ir import *
-from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
 from air.dialects.arith import ConstantOp
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 from ml_dtypes import bfloat16
 
 import numpy as np
@@ -32,11 +29,7 @@ def build_module(n, tile_n, herd_n, np_dtype_in):
     l3memrefTy = MemRefType.get(a_size, xrt_dtype_in)
 
     # L1 MemRefTypes
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
-    )
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
 
     # Function declaration
     softmax_func = FuncOp(
@@ -68,20 +61,7 @@ def herd_body(
 
             for t in range_(0, n, tile_n * herd_n):
 
-                offset_map = AffineMap.get(
-                    0,
-                    2,
-                    [
-                        AffineExpr.get_add(
-                            AffineSymbolExpr.get(0),
-                            AffineExpr.get_mul(
-                                AffineSymbolExpr.get(1),
-                                AffineConstantExpr.get(tile_n),
-                            ),
-                        )
-                    ],
-                )
-                offset = affine_apply(offset_map, [t, _ty])
+                offset = tile_offset_1d(t, _ty, tile_n)
 
                 dma_memcpy_nd(
                     l1_a_data,
@@ -121,20 +101,7 @@ def herd_body(
     HERD_N = 4
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the softmax example")
     parser.add_argument(
         "--n",
         type=int,
@@ -148,22 +115,6 @@ def herd_body(
         default=HERD_N,
         help="Number of L1 tiles along the N dimension",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-        help="Configure to whether to run after compile",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -191,33 +142,16 @@ def herd_body(
         for i in range(args.tile_n):
             outputs[j][i] = outputs[j][i] / sum_val
 
-    if args.compile_mode == "compile-and-run":
-
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="softmax",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[inputs],
-                expected_outputs=[outputs],
-                rtol=1e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    # Exit with the status returned by run_on_npu so failures reach the caller.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[inputs],
+            expected_outputs=[outputs],
+            instance_name="softmax",
+            omit_while_true_loop=False,
+            runtime_loop_tiling_sizes=[4, 4],
+            rtol=1e-1,
+        )
+    )
diff --git a/programming_examples/swiglu/swiglu.py b/programming_examples/swiglu/swiglu.py
index 879b1d261..7e3ef0952 100644
--- a/programming_examples/swiglu/swiglu.py
+++ b/programming_examples/swiglu/swiglu.py
@@ -21,7 +21,6 @@
 Computation is vectorized using vector.transfer_read/write.
 """
 
-import argparse
 import numpy as np
 from ml_dtypes import bfloat16
 
@@ -33,8 +32,7 @@
 from air.dialects.vector import transfer_read, transfer_write, BroadcastOp
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -51,21 +49,12 @@ def build_module(n, tile_n, np_dtype_in, vector_size=16):
     # gate and up packed as [2, N]: row 0 = gate, row 1 = up
     l3GateUpTy = MemRefType.get([2, n], xrt_dtype_in)
 
-    l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    l1MemrefTy = MemRefType.get(
-        shape=[tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=l1_mem_space,
-    )
+    l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
     # L1 buffer for gate+up tile: flat [2*tile_n] for simple 1D indexing
-    l1GateUpTy = MemRefType.get(
-        shape=[2 * tile_n],
-        element_type=xrt_dtype_in,
-        memory_space=l1_mem_space,
-    )
+    l1GateUpTy = l1_memref_type([2 * tile_n], xrt_dtype_in)
 
-    vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
+    identity_map = identity_map_attr()
 
     @FuncOp.from_py_func(l3memrefTy, l3GateUpTy, l3memrefTy)
     def swiglu(arg0, arg1, arg2):
@@ -175,31 +164,12 @@ def herd_body(
     TILE_N = 1024
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the SwiGLU example",
-    )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
+    parser = make_air_parser("Builds, runs, and tests the SwiGLU example")
     parser.add_argument("--n", type=int, default=N, help="Total number of elements")
     parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
     parser.add_argument(
         "--vector-size", type=int, default=16, help="Vector size for SIMD operations"
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.n, args.tile_n, INPUT_DATATYPE, args.vector_size)
@@ -215,55 +185,42 @@ def herd_body(
     # Pack gate and up into [2, N]: row 0 = gate, row 1 = up
     input_gate_up = np.stack([input_gate, input_up]).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
-
-        # SwiGLU reference using tanh-based sigmoid (matches hardware computation)
-        def swiglu_ref(x, gate, up):
-            x_f32 = x.astype(np.float32)
-            g_f32 = gate.astype(np.float32)
-            u_f32 = up.astype(np.float32)
-            xg = x_f32 * g_f32
-            silu_xg = xg * 0.5 * (np.tanh(xg / 2.0) + 1.0)
-            return silu_xg * (x_f32 * u_f32)
-
-        sampled_values = np.array(
-            [
-                swiglu_ref(input_x[i], input_gate[i], input_up[i])
-                for i in zip(*sampled_indices)
-            ],
-            dtype=INPUT_DATATYPE,
-        )
-        sampled_data = {
-            "shape": (args.n,),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="swiglu",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[input_x, input_gate_up],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=1e-1,
-                atol=5e-2,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
+
+    # SwiGLU reference using tanh-based sigmoid (matches hardware computation)
+    def swiglu_ref(x, gate, up):
+        x_f32 = x.astype(np.float32)
+        g_f32 = gate.astype(np.float32)
+        u_f32 = up.astype(np.float32)
+        xg = x_f32 * g_f32
+        silu_xg = xg * 0.5 * (np.tanh(xg / 2.0) + 1.0)
+        return silu_xg * (x_f32 * u_f32)
+
+    sampled_values = np.array(
+        [
+            swiglu_ref(input_x[i], input_gate[i], input_up[i])
+            for i in zip(*sampled_indices)
+        ],
+        dtype=INPUT_DATATYPE,
+    )
+    sampled_data = {
+        "shape": (args.n,),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    # Exit with run_on_npu's return value, as the other examples do.
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[input_x, input_gate_up],
+            stochastic_expected_outputs=[sampled_data],
+            instance_name="swiglu",
+            omit_while_true_loop=False,
+            runtime_loop_tiling_sizes=[4, 4],
+            rtol=1e-1,
+            atol=5e-2,
+        )
+    )
diff --git a/programming_examples/utils.py b/programming_examples/utils.py
new file mode 100644
index 000000000..a2e5d77d5
--- /dev/null
+++ b/programming_examples/utils.py
@@ -0,0 +1,167 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: MIT
+"""Shared helpers for programming_examples.
+
+This file sits at the top of programming_examples/. A script one directory
+below it (e.g. relu/relu.py) can import the helpers with:
+    import sys, os; sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+    from utils import ...
+Scripts nested deeper need one extra ".." per additional directory level, e.g.
+    os.path.join(os.path.dirname(__file__), "..", "..", "..") for a/b/c/script.py
+"""
+
+import argparse
+import numpy as np
+
+from air.ir import *
+from air.dialects.affine import apply as affine_apply
+from air.dialects import arith
+from air.dialects.memref import subview
+from air.dialects.vector import transfer_read, transfer_write
+from air.dialects.air import MemorySpace
+from air.backend.xrt_runner import type_mapper, run_on_npu, make_air_parser
+from air.backend.xrt import compile_air, get_air_runtime, XRTTensor
+from air.extras import types as T
+
+# ---------------------------------------------------------------------------
+# MLIR construct helpers (used inside @module_builder)
+# ---------------------------------------------------------------------------
+
+
+def make_l1_memref(shape, dtype):
+    """Return a MemRefType of `shape`/`dtype` tagged with MemorySpace.L1."""
+    # The memory space is attached as an i32 IntegerAttr.
+    l1_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
+    return MemRefType.get(shape, dtype, memory_space=l1_space)
+
+
+def make_l2_memref(shape, dtype):
+    """Return a MemRefType of `shape`/`dtype` tagged with MemorySpace.L2."""
+    # The memory space is attached as an i32 IntegerAttr.
+    l2_space = IntegerAttr.get(T.i32(), MemorySpace.L2)
+    return MemRefType.get(shape, dtype, memory_space=l2_space)
+
+
+def make_vec_type(size, dtype):
+    """Return a rank-1 VectorType with `size` elements of element type `dtype`."""
+    return VectorType.get([size], dtype)
+
+
+def identity_map_1d():
+    """Return the 1-D identity AffineMapAttr (permutation map for vector transfers)."""
+    return AffineMapAttr.get(AffineMap.get_identity(1))
+
+
+def tiled_1d_offset(loop_var, tile_idx, tile_n):
+    """
+    Emit an affine_apply that evaluates loop_var + tile_idx * tile_n.
+
+    The affine map has no dimensions and two symbols: s0 is bound to
+    loop_var and s1 to tile_idx. tile_n is folded into the map as an
+    affine constant, so it must be known when the module is built.
+
+    Args:
+        loop_var:  operand bound to s0, e.g. the outer loop induction variable
+        tile_idx:  operand bound to s1, e.g. the herd tile index _ty
+        tile_n:    tile size in elements (Python int)
+
+    Returns:
+        The value produced by affine_apply for the map s0 + s1 * tile_n.
+    """
+    # Symbols are created in operand order: s0 first, then s1.
+    base = AffineSymbolExpr.get(0)
+    tile = AffineSymbolExpr.get(1)
+    stride = AffineConstantExpr.get(tile_n)
+    offset_map = AffineMap.get(
+        0,
+        2,
+        [AffineExpr.get_add(base, AffineExpr.get_mul(tile, stride))],
+    )
+    return affine_apply(offset_map, [loop_var, tile_idx])
+
+
+def vec_read(buf, j, vec_size, c0, vec_ty, cst0, imap):
+    """Take a [j : j + vec_size] subview of buf and transfer_read it as vec_ty."""
+    memref = getattr(buf, "result", buf)  # unwrap an op to its result if needed
+    window = subview(memref, [j], [vec_size], [1])
+    return transfer_read(vec_ty, window, [c0], imap, cst0, [True])
+
+
+def vec_write(val, buf, j, vec_size, c0, imap):
+    """Take a [j : j + vec_size] subview of buf and transfer_write val into it."""
+    memref = getattr(buf, "result", buf)  # unwrap an op to its result if needed
+    window = subview(memref, [j], [vec_size], [1])
+    transfer_write(None, val, window, [c0], imap, [True])
+
+
+# ---------------------------------------------------------------------------
+# Argument-parser factory
+# ---------------------------------------------------------------------------
+
+
+def make_air_parser(description, prog="run.py"):
+    """
+    Build the ArgumentParser shared by the examples, with four common options:
+        -v / --verbose            (store_true)
+        -p / --print-module-only  (store_true)
+        --compile-mode  {compile-only, compile-and-run}, default compile-and-run
+        --output-format {xclbin, elf}, default xclbin
+    Callers append their own options (--n, --tile-n, ...) to the returned parser.
+    NOTE(review): shadows the make_air_parser imported from xrt_runner above.
+    """
+    parser = argparse.ArgumentParser(prog=prog, description=description)
+    for short_flag, long_flag in (("-v", "--verbose"), ("-p", "--print-module-only")):
+        parser.add_argument(short_flag, long_flag, action="store_true")
+    parser.add_argument(
+        "--compile-mode",
+        type=str,
+        choices=["compile-only", "compile-and-run"],
+        dest="compile_mode",
+        default="compile-and-run",
+    )
+    parser.add_argument(
+        "--output-format",
+        type=str,
+        choices=["xclbin", "elf"],
+        default="xclbin",
+        dest="output_format",
+    )
+    return parser
+
+
+# ---------------------------------------------------------------------------
+# Stochastic sampling helper
+# ---------------------------------------------------------------------------
+
+
+def stochastic_check(inputs, n, ref_fn, dtype, num_samples=100):
+    """
+    Sample random positions of a 1D element-wise op and compute their references.
+
+    Args:
+        inputs:      list of numpy arrays; each is indexed at every sampled position
+        n:           exclusive upper bound for sampled indices; also the output length
+        ref_fn:      reference function, called as ref_fn(*values_at_position)
+        dtype:       numpy dtype of the returned "values" array
+        num_samples: how many indices to draw (duplicates are possible)
+    Returns:
+        dict with keys "shape" ((n,)), "indices" ((1, num_samples) array), "values"
+    """
+    sampled_indices = np.vstack([np.random.randint(0, n, num_samples)])
+    expected = []
+    for position in zip(*sampled_indices):
+        expected.append(ref_fn(*(operand[position] for operand in inputs)))
+    sampled_values = np.array(expected, dtype=dtype)
+    return {"shape": (n,), "indices": sampled_indices, "values": sampled_values}
+
+
+# ---------------------------------------------------------------------------
+# Print-module-only convenience
+# ---------------------------------------------------------------------------
+
+
+def check_print_module(mlir_module, args):
+    """Print the MLIR module and exit with status 0 if --print-module-only was set."""
+    if args.print_module_only:
+        print(mlir_module)
+        raise SystemExit(0)  # builtin exit() is a site helper, absent under -S
diff --git a/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py b/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py
index 5914a656e..41a3da919 100644
--- a/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py
+++ b/programming_examples/vector_matrix_multiplication/bf16/single_core/single_core.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
 from ml_dtypes import bfloat16
 
 from air.ir import *
@@ -10,7 +9,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -272,20 +271,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     INPUT_DATATYPE = bfloat16
     OUTPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the bf16 vecmat example")
     parser.add_argument(
         "--k", type=int, default=K, help="K dimension size in a (1xK) * (KxN) matmul"
     )
@@ -301,14 +287,6 @@ def herd_body(_tx, _ty, _sx, _sy):
     parser.add_argument(
         "--tile-n", type=int, default=TILE_N, help="N dimension size of each L1 tile"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -330,17 +308,12 @@ def herd_body(_tx, _ty, _sx, _sy):
     )
     output_c = np.dot(input_a.astype(OUTPUT_DATATYPE), input_b.astype(OUTPUT_DATATYPE))
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        output_format=args.output_format,
-        instance_name="vecmat_bf16",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a, input_b],
+            instance_name="vecmat_bf16",
             expected_outputs=[output_c],
             rtol=0.04,
         )
diff --git a/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py b/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py
index 79d89f8bb..e8f60b221 100644
--- a/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py
+++ b/programming_examples/vector_matrix_multiplication/block_quantized_i8/single_core/single_core.py
@@ -1,7 +1,5 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-
 from air.ir import *
 from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
@@ -9,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -404,20 +402,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     ACC_DATATYPE = np.int32
     OUTPUT_DATATYPE = np.float32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the quantized i8 vecmat example")
     parser.add_argument(
         "--k", type=int, default=K, help="K dimension size in a (1xK) * (KxN) matmul"
     )
@@ -439,14 +424,6 @@ def herd_body(_tx, _ty, _sx, _sy):
     parser.add_argument(
         "--tile-n", type=int, default=TILE_N, help="N dimension size of each L1 tile"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -499,17 +476,12 @@ def herd_body(_tx, _ty, _sx, _sy):
             )
             ival = 0
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        output_format=args.output_format,
-        instance_name="vecmat_i8",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a, input_a_s, input_b, input_b_s],
+            instance_name="vecmat_i8",
             expected_outputs=[output_c],
         )
     )
diff --git a/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py b/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py
index 7dab0af7a..154587707 100644
--- a/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py
+++ b/programming_examples/vector_matrix_multiplication/i8/single_core/single_core.py
@@ -1,7 +1,5 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
-import argparse
-
 from air.ir import *
 from air.dialects.affine import apply as affine_apply
 from air.dialects.air import *
@@ -9,7 +7,7 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store
 from air.dialects.func import FuncOp, CallOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 import numpy as np
 
@@ -289,20 +287,7 @@ def herd_body(_tx, _ty, _sx, _sy):
     INPUT_DATATYPE = np.int8
     OUTPUT_DATATYPE = np.int32
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the passthrough_dma example",
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-p",
-        "--print-module-only",
-        action="store_true",
-    )
+    parser = make_air_parser("Builds, runs, and tests the i8 vecmat example")
     parser.add_argument(
         "--k", type=int, default=K, help="K dimension size in a (1xK) * (KxN) matmul"
     )
@@ -318,14 +303,6 @@ def herd_body(_tx, _ty, _sx, _sy):
     parser.add_argument(
         "--tile-n", type=int, default=TILE_N, help="N dimension size of each L1 tile"
     )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-        help="Output format for the compiled binary (default: xclbin)",
-    )
 
     args = parser.parse_args()
 
@@ -357,17 +334,12 @@ def herd_body(_tx, _ty, _sx, _sy):
     input_b = input_b.astype(INPUT_DATATYPE)
     output_c = np.dot(input_a.astype(OUTPUT_DATATYPE), input_b.astype(OUTPUT_DATATYPE))
 
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        output_format=args.output_format,
-        instance_name="vecmat_i8",
-        runtime_loop_tiling_sizes=[4, 4],
-    )
     exit(
-        runner.run_test(
+        run_on_npu(
+            args,
             mlir_module,
             inputs=[input_a, input_b],
+            instance_name="vecmat_i8",
             expected_outputs=[output_c],
         )
     )
diff --git a/programming_examples/weighted_rms_norm/weighted_rms_norm.py b/programming_examples/weighted_rms_norm/weighted_rms_norm.py
index 9a15be31c..8c6c9e6e3 100644
--- a/programming_examples/weighted_rms_norm/weighted_rms_norm.py
+++ b/programming_examples/weighted_rms_norm/weighted_rms_norm.py
@@ -15,7 +15,6 @@
 configurable VECTOR_SIZE (default 16 for AIE2).
 """
 
-import argparse
 import numpy as np
 from ml_dtypes import bfloat16
 
@@ -31,8 +30,7 @@
 )
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
 
 range_ = for_
 
@@ -46,17 +44,16 @@ def build_module(M, N, np_dtype, vector_size=16):
         N % vector_size == 0
     ), f"N ({N}) must be divisible by vector_size ({vector_size})"
 
-    vecTy = VectorType.get([vector_size], xrt_dtype)
-    identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
+    vecTy = vec_type(vector_size, xrt_dtype)
+    identity_map = identity_map_attr()
 
     # L3 types
     l3MemrefTy = MemRefType.get([M, N], xrt_dtype)
     l3WeightTy = MemRefType.get([N], xrt_dtype)
 
     # L1 types
-    l1_mem_space = IntegerAttr.get(T.i32(), MemorySpace.L1)
-    l1RowTy = MemRefType.get([N], xrt_dtype, memory_space=l1_mem_space)
-    l1VecTy = MemRefType.get([vector_size], xrt_dtype, memory_space=l1_mem_space)
+    l1RowTy = l1_memref_type([N], xrt_dtype)
+    l1VecTy = l1_memref_type([vector_size], xrt_dtype)
 
     @FuncOp.from_py_func(l3MemrefTy, l3WeightTy, l3MemrefTy)
     def weighted_rms_norm(arg0, arg1, arg2):
@@ -161,12 +158,9 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_weight, l3_out):
     VECTOR_SIZE = 16
     INPUT_DATATYPE = bfloat16
 
-    parser = argparse.ArgumentParser(
-        prog="run.py",
-        description="Builds, runs, and tests the weighted RMS normalization example",
+    parser = make_air_parser(
+        "Builds, runs, and tests the weighted RMS normalization example"
     )
-    parser.add_argument("-v", "--verbose", action="store_true")
-    parser.add_argument("-p", "--print-module-only", action="store_true")
     parser.add_argument("--M", type=int, default=M_DEFAULT, help="M dimension (rows)")
     parser.add_argument("--N", type=int, default=N_DEFAULT, help="N dimension (cols)")
     parser.add_argument(
@@ -175,20 +169,6 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_weight, l3_out):
         default=VECTOR_SIZE,
         help="Vector size for SIMD operations",
     )
-    parser.add_argument(
-        "--compile-mode",
-        type=str,
-        choices=["compile-only", "compile-and-run"],
-        dest="compile_mode",
-        default="compile-and-run",
-    )
-    parser.add_argument(
-        "--output-format",
-        type=str,
-        choices=["xclbin", "elf"],
-        default="xclbin",
-        dest="output_format",
-    )
     args = parser.parse_args()
 
     mlir_module = build_module(args.M, args.N, INPUT_DATATYPE, args.vector_size)
@@ -209,30 +189,14 @@ def herd_body(_tx, _ty, _sx, _sy, l3_in, l3_weight, l3_out):
         (x_input.astype(np.float32) / rms) * weight.astype(np.float32)
     ).astype(INPUT_DATATYPE)
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
+    exit(
+        run_on_npu(
+            args,
+            mlir_module,
+            inputs=[x_input, weight],
             instance_name="weighted_rms_norm",
-            runtime_loop_tiling_sizes=[4, 4],
+            expected_outputs=[y_expected],
+            rtol=5e-2,
+            atol=5e-1,
         )
-        exit(
-            runner.run_test(
-                mlir_module,
-                inputs=[x_input, weight],
-                expected_outputs=[y_expected],
-                rtol=5e-2,
-                atol=5e-1,
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-        backend.unload()
+    )
diff --git a/python/air/backend/xrt.py b/python/air/backend/xrt.py
index 259205361..9b3cbe68f 100644
--- a/python/air/backend/xrt.py
+++ b/python/air/backend/xrt.py
@@ -3,15 +3,33 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 
+"""
+XRT backend for mlir-air.
+
+Public API
+----------
+compile_air(air_module, ...)  -> NPUKernel
+    Compile an AIR dialect MLIR module to an NPU kernel artifact.
+
+AirRuntime                   (CachedXRTRuntime subclass)
+    Richer verify_results() with rtol/atol, stochastic sampling,
+    mismatch budget, and Pearson correlation.
+
+get_air_runtime() -> AirRuntime
+    Return the process-level singleton AirRuntime.
+
+XRTTensor                    (re-exported from aie.utils)
+    Numpy-backed buffer object for XRT.
+"""
+
 import air.ir
 import air.passmanager
 
-from .abc import AirBackend, AirBackendError
+from .abc import AirBackendError
 
 import air.compiler.util
 
 # Register the AIR dialect so air.ir.Context() can parse AIR ops.
-# This was previously done as a side effect of importing aircc.main.
 from air.dialects import air as _air_dialect  # noqa: F401
 
 import numpy as np
@@ -21,31 +39,653 @@
 
 from ml_dtypes import bfloat16
 
+# ---------------------------------------------------------------------------
+# mlir-aie runtime imports
+# ---------------------------------------------------------------------------
+try:
+    import aie.utils as _aie_utils
+    from aie.utils import CachedXRTRuntime, NPUKernel
+    from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor
+
+    _HAS_AIE_RUNTIME = True
+    # Factory function that selects XRTTensor or CPUOnlyTensor depending on
+    # whether pyxrt is importable. Used throughout instead of XRTTensor() directly.
+    _tensor = _aie_utils.tensor
+except ImportError:
+    _HAS_AIE_RUNTIME = False
+    CachedXRTRuntime = object  # fallback base so class definition succeeds
+    NPUKernel = None
+    XRTTensor = None
+    _tensor = None
+
+
+# ---------------------------------------------------------------------------
+# compile_air() — replaces XRTBackend.compile()
+# ---------------------------------------------------------------------------
+
+
+def compile_air(
+    air_module: air.ir.Module,
+    *,
+    verbose: bool = False,
+    target_device: str = None,
+    omit_while_true_loop: bool = False,
+    omit_pingpong: str = "",
+    lower_linalg_to_func=None,
+    air_loop_fusion: bool = False,
+    runtime_loop_tiling_sizes=None,
+    omit_auto_broadcast: bool = False,
+    channel_multiplexing=None,
+    use_lock_race_condition_fix: bool = False,
+    trace_offset: int = 0,
+    trace_size: int = 0,
+    output_format: str = "xclbin",
+    xclbin_kernel_name: str = "",
+    instance_name: str = "",
+    kernel_id: str = "",
+    xclbin_input: str = "",
+    num_device_cols: int = 0,
+    debug_ir: bool = False,
+    bf16_emulation: bool = False,
+    # Legacy aliases kept for backward compat
+    kernel_name: str = "",
+    output_binary_name: str = "air",
+    insts: str = "air.insts.bin",
+):
+    """Compile an AIR dialect MLIR module to an NPUKernel artifact.
+
+    Replaces ``XRTBackend(...).compile(air_module)``.
+
+    Args:
+        air_module: The MLIR module in AIR dialect.
+        verbose: Verbose output.
+        target_device: Explicit target device ("npu1", "npu2", etc.).
+            If None, auto-detect via xrt-smi.
+        omit_while_true_loop: Omit the while-true loop in generated code.
+        omit_pingpong: Omit ping-pong buffering for given memory level.
+            Values: "", "L1", "L2", "all".
+        lower_linalg_to_func: Lower linalg.generic to function calls.
+        air_loop_fusion: Enable air-loop-fusion pass.
+        runtime_loop_tiling_sizes: Extra runtime loop tiling sizes.
+        omit_auto_broadcast: Omit automatic broadcast detection.
+        channel_multiplexing: Air channel multiplexing memory spaces.
+        use_lock_race_condition_fix: Enable lock race condition fix.
+        trace_offset: Trace output offset (bytes).
+        trace_size: Trace output size (bytes).
+        output_format: Output binary format: "xclbin", "elf", or "txn".
+        xclbin_kernel_name: Kernel name embedded in xclbin metadata.
+        instance_name: Instance name embedded in xclbin metadata.
+        kernel_id: Kernel ID embedded in xclbin file.
+        xclbin_input: Existing xclbin to embed the new kernel into.
+        num_device_cols: Device columns to constrain the design to (0=all).
+        debug_ir: Save IR after each pass to debug_ir/ directory.
+        bf16_emulation: Emulate f32 vector arithmetic with bf16.
+        kernel_name: Legacy alias for xclbin_kernel_name.
+        output_binary_name: Base name for the output binary (without extension).
+        insts: Instruction filename (for xclbin format).
+
+    Returns:
+        NPUKernel: Compiled kernel artifact with xclbin/insts paths.
+    """
+    if runtime_loop_tiling_sizes is None:
+        runtime_loop_tiling_sizes = []
+    if channel_multiplexing is None:
+        channel_multiplexing = []
+
+    # Support legacy kernel_name alias
+    effective_kernel_name = xclbin_kernel_name or kernel_name
+
+    # Support backward compatibility: convert bool omit_pingpong
+    if isinstance(omit_pingpong, bool):
+        omit_pingpong = "all" if omit_pingpong else ""
+
+    # Determine target device
+    if target_device is not None:
+        if verbose:
+            print(f"Using explicitly specified target device: {target_device}")
+    else:
+        target_device = "npu1"  # default fallback
+        try:
+            import re
 
-class XRTCompileArtifact:
-    """A class encompassing information on the artifacts produced by compilation for the NPU/XRT"""
+            xrtsmi = "/opt/xilinx/xrt/bin/xrt-smi"
+            result = subprocess.run(
+                [xrtsmi, "examine"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            result_lines = result.stdout.decode("utf-8").split("\n")
+            p = re.compile(r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU (\w+))\W*\|")
+            for line in result_lines:
+                m = p.match(line)
+                if not m:
+                    continue
+                if verbose:
+                    print("Found Ryzen AI device:", m.group(1))
+                model = "unknown"
+                if m.group(3):
+                    model = str(m.group(3))
+                if m.group(4):
+                    model = str(m.group(4))
+                if verbose:
+                    print(f"\tmodel: '{model}'")
+                if model in ["npu1", "Phoenix"]:
+                    target_device = "npu1"
+                elif model in ["npu4", "Strix"]:
+                    target_device = "npu2"
+                else:
+                    print(f"WARNING: xrt-smi reported unknown NPU model '{model}'.")
+                break
+        except Exception as e:
+            if verbose:
+                print("Failed to run xrt-smi, using default target device")
+                print(e)
+
+    # Validate ELF format compatibility
+    if output_format == "elf" and "npu1" in target_device:
+        raise AirBackendError(
+            f"output_format='elf' is not supported for {target_device} target. "
+            "ELF output format is only supported on npu2 and later devices."
+        )
+
+    # Apply column configuration
+    if num_device_cols > 0:
+        max_cols = 4 if target_device == "npu1" else 8
+        if num_device_cols > max_cols - 1:
+            raise AirBackendError(
+                f"Invalid num_device_cols value: {num_device_cols}. "
+                f"For {target_device}, valid values are 0 (entire device) or 1-{max_cols-1}"
+            )
+        base_device = target_device
+        target_device = f"{target_device}_{num_device_cols}col"
+        if verbose:
+            print(
+                f"Confining design to {num_device_cols} column(s) of {base_device} device: {target_device}"
+            )
 
-    def __init__(
+    # Determine peano toolchain
+    peano_package_dir = os.environ.get("PEANO_INSTALL_DIR", "")
+    if peano_package_dir and os.path.isdir(peano_package_dir):
+        print(
+            "compile_air: llvm-aie package detected via PEANO_INSTALL_DIR:",
+            peano_package_dir,
+        )
+
+    # Determine output binary file name
+    if output_format == "elf":
+        output_binary = f"{output_binary_name}.elf"
+    elif output_format == "txn":
+        output_binary = f"{output_binary_name}.txn"
+    else:  # xclbin (default)
+        output_binary = f"{output_binary_name}.xclbin"
+
+    with air.ir.Context():
+        if verbose:
+            print("AIR Module:")
+            print(air_module)
+
+        aircc_options = [
+            "--device",
+            target_device,
+            "air.mlir",
+        ]
+
+        # Output file options
+        if output_format == "elf":
+            aircc_options += ["--elf-name", output_binary]
+        else:
+            aircc_options += ["-o", output_binary]
+            aircc_options += ["-i", insts]
+
+        for s in runtime_loop_tiling_sizes:
+            aircc_options += [f"--air-runtime-loop-tiling-sizes={s}"]
+
+        if verbose:
+            aircc_options = aircc_options + ["-v"]
+
+        if omit_while_true_loop:
+            aircc_options += ["--omit-while-true-loop"]
+
+        if omit_pingpong:
+            pp_val = "all" if omit_pingpong is True else str(omit_pingpong)
+            aircc_options += [f"--omit-ping-pong-transform={pp_val}"]
+
+        if lower_linalg_to_func:
+            aircc_options += ["--lower-linalg-to-func"]
+            aircc_options += [lower_linalg_to_func]
+
+        if air_loop_fusion:
+            aircc_options += ["--air-loop-fusion"]
+
+        if omit_auto_broadcast:
+            aircc_options += ["--omit-auto-broadcast"]
+
+        if len(channel_multiplexing) != 0:
+            for ch in channel_multiplexing:
+                aircc_options += [f"--air-channel-multiplexing={ch}"]
+
+        if use_lock_race_condition_fix:
+            aircc_options += ["--use-lock-race-condition-fix"]
+
+        if trace_size != 0:
+            aircc_options += ["-trace-size"]
+            aircc_options += [str(trace_size)]
+            aircc_options += ["-trace-offset"]
+            aircc_options += [str(trace_offset)]
+
+        if output_format != "":
+            aircc_options += ["--output-format"]
+            aircc_options += [output_format]
+        if effective_kernel_name != "":
+            aircc_options += ["--xclbin-kernel-name"]
+            aircc_options += [effective_kernel_name]
+        if instance_name != "":
+            aircc_options += ["--xclbin-instance-name"]
+            aircc_options += [instance_name]
+        if kernel_id != "":
+            aircc_options += ["--xclbin-kernel-id"]
+            aircc_options += [kernel_id]
+        if xclbin_input != "":
+            aircc_options += ["--xclbin-input"]
+            aircc_options += [xclbin_input]
+
+        if peano_package_dir != "":
+            aircc_options += ["--peano"]
+            aircc_options += [peano_package_dir]
+            aircc_options += ["--no-xchesscc"]
+            aircc_options += ["--no-xbridge"]
+        else:
+            aircc_options += ["--xchesscc"]
+            aircc_options += ["--xbridge"]
+
+        if debug_ir:
+            aircc_options += ["--debug-ir"]
+
+        if bf16_emulation:
+            aircc_options += ["--bf16-emulation"]
+
+        if verbose:
+            print("Running aircc with options:", " ".join(aircc_options))
+
+        # Write module to disk for aircc
+        with open("air.mlir", "w") as f:
+            f.write(str(air_module))
+
+        # Invoke aircc
+        aircc_exe = shutil.which("aircc")
+        if not aircc_exe:
+            raise AirBackendError(
+                "aircc binary not found in PATH. "
+                "Ensure mlir-air is installed and aircc is on PATH."
+            )
+        result = subprocess.run(
+            [aircc_exe] + aircc_options,
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            error_msg = result.stderr if result.stderr else result.stdout
+            raise AirBackendError(f"aircc compilation failed:\n{error_msg}")
+
+    # Build kernel_name for NPUKernel
+    if output_format == "elf" and instance_name != "":
+        npu_kernel_name = f"main:{instance_name}"
+    else:
+        npu_kernel_name = effective_kernel_name if effective_kernel_name else "MLIR_AIE"
+
+    if _HAS_AIE_RUNTIME:
+        return NPUKernel(output_binary, insts, kernel_name=npu_kernel_name)
+    else:
+        # Fallback: return a simple namespace when aie.utils is unavailable
+        import types
+
+        kernel = types.SimpleNamespace(
+            xclbin_path=output_binary,
+            insts_path=insts,
+            kernel_name=npu_kernel_name,
+        )
+        return kernel
+
+
+# ---------------------------------------------------------------------------
+# AirRuntime — CachedXRTRuntime with mlir-air's richer verification
+# ---------------------------------------------------------------------------
+
+
+class AirRuntime(CachedXRTRuntime):
+    """
+    mlir-aie's CachedXRTRuntime extended with mlir-air's richer verification.
+
+    Inherits: device open, xclbin/ELF caching (32 contexts NPU2), run().
+    Overrides: verify_results() with rtol/atol/stochastic/correlation checks.
+    Adds: run_test() convenience method that loads, runs, and verifies.
+    """
+
+    def run_test(
         self,
-        output_binary,
-        kernel,
-        insts,
-    ):
+        npu_kernel,
+        io_args,
+        refs=None,
+        rtol: float = 1e-3,
+        atol: float = 1e-8,
+        max_mismatch_percentage: float = 0.0,
+        min_correlation=None,
+        stochastic_refs=None,
+        verbosity: int = 0,
+        trace_file: str = "trace_data.txt",
+    ) -> int:
         """
-        Constructor for an XRTCompileArtifact
+        Load, run, and verify an NPU kernel.
 
         Args:
-            output_binary: output binary file name/path (.xclbin, .elf, or .txn)
-            kernel: kernel name
-            insts: instruction file name/path
+            npu_kernel: NPUKernel from compile_air().
+            io_args: List of XRTTensor objects (inputs + outputs).
+            refs: dict mapping output index → expected numpy array (dense).
+            rtol: Relative tolerance for floating-point checks.
+            atol: Absolute tolerance for floating-point checks.
+            max_mismatch_percentage: Max % of elements allowed to mismatch.
+            min_correlation: Minimum Pearson correlation (None = disabled).
+            stochastic_refs: List of {"shape", "indices", "values"} dicts.
+            verbosity: Verbosity level.
+            trace_file: Trace output filename; NOTE(review): not read by this body.
+
+        Returns:
+            0 on pass, -1 on failure.
         """
+        import filelock
+
+        handle = self.load(npu_kernel)
+        with filelock.FileLock("/tmp/npu.lock"):  # held only around run()
+            self.run(handle, io_args)
+
+        errors = self.verify_results(
+            io_args,
+            refs=refs or {},
+            rtol=rtol,
+            atol=atol,
+            max_mismatch_percentage=max_mismatch_percentage,
+            min_correlation=min_correlation,
+            stochastic_refs=stochastic_refs,
+            verbosity=verbosity,
+        )
+        if errors == 0:
+            print("PASS!")
+            return 0
+        else:
+            print("failed.")
+            return -1
+
+    @classmethod
+    def verify_results(
+        cls,
+        io_args,
+        refs=None,
+        rtol: float = 1e-3,
+        atol: float = 1e-8,
+        max_mismatch_percentage: float = 0.0,
+        min_correlation=None,
+        stochastic_refs=None,
+        verbosity: int = 0,
+    ) -> int:
+        """
+        Verify kernel outputs against reference data.
+
+        Args:
+            io_args: List of XRTTensor (or numpy array) outputs.
+            refs: dict {index: expected_np_array} for dense checks.
+            rtol: Relative tolerance.
+            atol: Absolute tolerance.
+            max_mismatch_percentage: Max % of mismatches tolerated (0–100).
+            min_correlation: Minimum Pearson correlation (None = disabled).
+            stochastic_refs: {"shape","indices","values"} dicts; entry i checks io_args[len(refs) + i].
+            verbosity: Verbosity level.
+
+        Returns:
+            Number of errors found (0 = pass).
+        """
+        if refs is None:
+            refs = {}
+
+        errors = 0
+        np.set_printoptions(formatter={"int": hex})  # changes NumPy's global print options
+
+        for idx, expected in refs.items():
+            raw = io_args[idx]
+            actual = raw.numpy() if hasattr(raw, "numpy") else np.asarray(raw)
+            actual = np.reshape(actual, expected.shape)
+
+            if verbosity >= 1:
+                print(f"Expected output [{idx}]:", expected)
+                print(f"Actual output [{idx}]:", actual)
+
+            errors += _check_dense(
+                actual,
+                expected,
+                rtol=rtol,
+                atol=atol,
+                idx=idx,
+                max_mismatch_percentage=max_mismatch_percentage,
+                min_correlation=min_correlation,
+            )
+
+        if stochastic_refs:
+            num_dense = len(refs)
+            for i, sref in enumerate(stochastic_refs):
+                raw = io_args[num_dense + i]
+                actual = raw.numpy() if hasattr(raw, "numpy") else np.asarray(raw)
+                actual = np.reshape(actual, sref["shape"])
+
+                if verbosity >= 1:
+                    print(f"Stochastic expected [{i}]: shape={sref['shape']}")
+                    print(f"Stochastic actual [{i}]:", actual)
+
+                errors += _check_stochastic(
+                    actual,
+                    sref,
+                    rtol=rtol,
+                    atol=atol,
+                    idx=i,
+                    max_mismatch_percentage=max_mismatch_percentage,
+                )
+
+        return errors
+
+
+# ---------------------------------------------------------------------------
+# Module-level singleton
+# ---------------------------------------------------------------------------
+
+_air_runtime = None
+
+
+def get_air_runtime() -> AirRuntime:
+    """Create the AirRuntime on first use and return the shared instance."""
+    global _air_runtime
+    if _air_runtime is not None:
+        return _air_runtime  # already built by an earlier call
+    if not _HAS_AIE_RUNTIME:
+        message = "aie.utils (mlir-aie) is not available. "
+        message += "Install mlir-aie to use AirRuntime."
+        raise AirBackendError(message)
+    _air_runtime = AirRuntime()
+    return _air_runtime
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers for verification
+# ---------------------------------------------------------------------------
+
+
+def _check_dense(
+    actual, expected, rtol, atol, idx, max_mismatch_percentage, min_correlation
+):
+    """Compare a whole output with its reference. Returns 0 (ok) or 1 (error).
+
+    Float references (float16/32/64, bfloat16) go through np.isclose, may
+    tolerate max_mismatch_percentage of mismatches, and must also reach
+    min_correlation when it is given. Other dtypes must match exactly.
+    """
+    n_total = expected.size
+    float_kinds = [np.float16, np.float32, np.float64, bfloat16]
+
+    if expected.dtype not in float_kinds:
+        if np.array_equal(actual, expected):
+            return 0
+        print(f"ERROR: Output {idx} does not meet expected output.")
+        bad = np.where(actual != expected)
+        n_bad = len(bad[0])
+        print(f"Shape: {expected.shape}")
+        if n_total > 0:
+            print(
+                f"Mismatches: {n_bad} / {n_total} elements "
+                f"({100*n_bad/n_total:.2f}%)"
+            )
+        _print_mismatches_dense(actual, expected, bad, n_bad)
+        return 1
+
+    if expected.dtype == bfloat16:
+        # Widen both sides to float64 before comparing.
+        expected = expected.astype(np.float64)
+        actual = actual.astype(np.float64)
+
+    bad = np.where(~np.isclose(actual, expected, rtol=rtol, atol=atol))
+    n_bad = len(bad[0])
+    budget = int(n_total * max_mismatch_percentage / 100)
+    passed = n_bad <= budget
+    if not passed:
+        print(f"ERROR: Output {idx} does not meet expected output.")
+        print(f"Shape: {expected.shape}")
+        if n_total > 0:
+            print(
+                f"Mismatches: {n_bad} / {n_total} elements "
+                f"({100*n_bad/n_total:.2f}%)"
+            )
+        if budget > 0:
+            print(f"Max acceptable: {budget} ({max_mismatch_percentage}%)")
+        _print_mismatches_dense(actual, expected, bad, n_bad)
+
+    if min_correlation is not None and n_total > 0:
+        # The correlation gate is evaluated even if the tolerance check failed.
+        corr = float(np.corrcoef(actual.flatten(), expected.flatten())[0, 1])
+        print(f"Output {idx} correlation: {corr:.6f} (threshold: {min_correlation})")
+        if not np.isfinite(corr) or corr < min_correlation:
+            passed = False
+            print(
+                f"ERROR: Output {idx} correlation {corr:.6f} below threshold {min_correlation}"
+            )
+
+    return 0 if passed else 1
+
+
+def _print_mismatches_dense(actual, expected, mismatch_indices, num_mismatches):
+    """Print up to 20 mismatching coordinates with expected/actual values."""
+    limit = 20
+    shown = min(limit, num_mismatches)
+    print(f"First {shown} mismatched locations:")
+    # The absolute difference is reported only for floating-point references.
+    show_diff = np.issubdtype(expected.dtype, np.floating)
+    for k in range(shown):
+        loc = tuple(axis[k] for axis in mismatch_indices)
+        want, got = expected[loc], actual[loc]
+        line = f"  Index {loc}: expected={want}, actual={got}"
+        if show_diff:
+            line += f", diff={abs(got - want)}"
+        print(line)
+    if num_mismatches > limit:
+        print(f"  ... and {num_mismatches - limit} more mismatches")
+
+
+def _check_stochastic(actual, sref, rtol, atol, idx, max_mismatch_percentage):
+    """Spot-check sampled elements of one output. Returns 0 (ok) or 1 (error).
+
+    Args:
+        actual: Output array, indexed with tuple(sref["indices"]).
+        sref: Dict with "shape" (used in messages), "indices" (one index
+            sequence per axis) and "values" (expected value per sample).
+        rtol: Relative tolerance for floating-point references.
+        atol: Absolute tolerance for floating-point references.
+        idx: Position of this reference; used only in messages.
+        max_mismatch_percentage: Max % of sampled elements allowed to
+            mismatch. Applies to float references; others must match exactly.
+
+    Returns:
+        0 when the samples are acceptable (or there are none), 1 otherwise.
+    """
+    # np.asarray takes arrays and plain sequences alike, so the dtype is read
+    # from the array instead of from element 0, which fails for empty input.
+    values = np.asarray(sref["values"])
+    if values.size == 0:
+        return 0  # nothing was sampled, so there is nothing to compare
+    total_elements = len(values)
+
+    is_float = values.dtype in [np.float16, np.float32, np.float64, bfloat16]
+    if values.dtype == bfloat16:
+        # Widen both sides to float64 before comparing.
+        values = values.astype(np.float64)
+        actual = actual.astype(np.float64)
+    actual_stochastic = actual[tuple(sref["indices"])]
+
+    if is_float:
+        close_mask = np.isclose(actual_stochastic, values, rtol=rtol, atol=atol)
+        mismatch_positions = np.where(~close_mask)[0]
+        max_acceptable = int(total_elements * max_mismatch_percentage / 100)
+        if len(mismatch_positions) <= max_acceptable:
+            return 0
+    else:
+        if np.array_equal(actual_stochastic, values):
+            return 0
+        mismatch_positions = np.where(actual_stochastic != values)[0]
+        max_acceptable = 0  # the exact path has no mismatch budget
+    num_mismatches = len(mismatch_positions)
+
+    print(f"ERROR: Stochastic output {idx} does not meet expected output.")
+    print(f"Shape: {sref['shape']}")
+    print(f"Stochastic check: {total_elements} sampled elements")
+    print(
+        f"Mismatches: {num_mismatches} / {total_elements} elements "
+        f"({100*num_mismatches/total_elements:.2f}%)"
+    )
+    if max_acceptable > 0:
+        print(f"Max acceptable: {max_acceptable} ({max_mismatch_percentage}%)")
+    max_display = 20
+    print(f"First {min(max_display, num_mismatches)} mismatched locations:")
+    for pos in mismatch_positions[:max_display]:
+        idx_t = tuple(dim[pos] for dim in sref["indices"])
+        exp_val = values[pos]
+        act_val = actual_stochastic[pos]
+        line = f"  Index {idx_t}: expected={exp_val}, actual={act_val}"
+        if is_float:
+            line += f", diff={abs(act_val - exp_val)}"
+        print(line)
+    if num_mismatches > max_display:
+        print(f"  ... and {num_mismatches - max_display} more mismatches")
+    return 1
+
+
+# ---------------------------------------------------------------------------
+# Backward compatibility shims
+# ---------------------------------------------------------------------------
+
+
+class XRTCompileArtifact:
+    """
+    Deprecated. Use NPUKernel from compile_air() instead.
+
+    Plain record of the three constructor arguments, kept so existing code
+    that unpacks .output_binary / .kernel / .insts still works.
+    """
+
+    def __init__(self, output_binary, kernel, insts):
         self.output_binary = output_binary
         self.kernel = kernel
         self.insts = insts
 
 
-class XRTBackend(AirBackend):
-    """Main entry-point for the xrt based AIR backend."""
+class XRTBackend:
+    """
+    Deprecated. Use compile_air() + get_air_runtime() instead.
+
+    This shim delegates to compile_air() and get_air_runtime() so
+    existing code continues to work without modification.
+    """
 
     def __init__(
         self,
@@ -53,11 +693,11 @@ def __init__(
         target_device: str = None,
         omit_while_true_loop: bool = False,
         omit_pingpong: str = "",
-        lower_linalg_to_func: str = None,
+        lower_linalg_to_func=None,
         air_loop_fusion: bool = False,
-        runtime_loop_tiling_sizes: list[int] = [],
+        runtime_loop_tiling_sizes=None,
         omit_auto_broadcast: bool = False,
-        channel_multiplexing: list[str] = [],
+        channel_multiplexing=None,
         use_lock_race_condition_fix: bool = False,
         trace_offset: int = 0,
         trace_size: int = 0,
@@ -70,51 +710,21 @@ def __init__(
         debug_ir: bool = False,
         bf16_emulation: bool = False,
     ):
-        """Constructor for XRTBackend
-
-        Args:
-            verbose: verbose output
-            target_device: specify target device explicitly ("npu1", "npu2", etc.). If None, will attempt auto-detection via xrt-smi. This parameter is useful when compiling without XRT installed.
-            omit_while_true_loop: configure aircc to omit the while true loop it traditionally emits.
-            omit_pingpong: configure aircc to omit the generation of ping-pong buffering for specific memory levels. Supported values: "", "L1", "L2", "all". Empty string means no omission (default).
-            lower_linalg_to_func: configure aircc to lower linalg.generic to function calls, or loops.
-            air_loop_fusion: configure aircc to add air-loop-fusion experimental pass.
-            runtime_loop_tiling_sizes: configure aircc to add extra runtime loop tiling using the experimental affine-loop-opt pass.
-            omit_auto_broadcast: configure aircc to omit the detection and lowering of broadcast data movements.
-            channel_multiplexing: configure aircc to perform air channel multiplexing on specified memroy spaces.
-            use_lock_race_condition_fix: configure aircc to enable a fix for lock race condition which protects against race condition.
-            trace_offset: configure aircc to stream out profiling traces at outputs, starting from the specified offset.
-            trace_size: configure aircc to stream out profiling traces at outputs, with specified trace data size.
-            output_format: configure aircc to produce output binary in to one of the following formats: [xclbin, txn, elf].
-            kernel_name: configure aircc to package the kernel with the specified name.
-            instance_name: configure aircc to package the kernel with specified instance name in xclbin metadata.
-            kernel_id: configure aircc to package the kernel with specified kernel id in xclbin file.
-            xclbin_input: configure aircc to package the kernel into an existing xclbin with specified xclbin file name.
-            num_device_cols: number of device columns to confine the design within (0 means entire device, default).
-                For npu1 (4 columns total): valid values are 0 (entire device), 1, 2, 3
-                For npu2 (8 columns total): valid values are 0 (entire device), 1, 2, 3, 4, 5, 6, 7
-            debug_ir: enable debug mode to emit IR after each individual pass for fine-grained inspection.
-                IRs are saved to <tmpdir>/debug_ir/ with sequence numbers.
-            bf16_emulation: emulate f32 vector arithmetic using bf16 operations.
-        """
-        super().__init__()
         self.verbose = verbose
         self.target_device = target_device
-        self.omit_while_true_loop = omit_while_true_loop
-        # Support backward compatibility: convert True to "all", False to ""
         if isinstance(omit_pingpong, bool):
             self.omit_pingpong = "all" if omit_pingpong else ""
         else:
             self.omit_pingpong = omit_pingpong
+        self.omit_while_true_loop = omit_while_true_loop
         self.lower_linalg_to_func = lower_linalg_to_func
         self.air_loop_fusion = air_loop_fusion
-        self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes
+        self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes or []
         self.omit_auto_broadcast = omit_auto_broadcast
-        self.channel_multiplexing = channel_multiplexing
+        self.channel_multiplexing = channel_multiplexing or []
         self.use_lock_race_condition_fix = use_lock_race_condition_fix
         self.trace_offset = trace_offset
         self.trace_size = trace_size
-        self.currently_loaded = False
         self.output_format = output_format
         self.kernel_name = kernel_name
         self.instance_name = instance_name
@@ -123,6 +733,19 @@ def __init__(
         self.num_device_cols = num_device_cols
         self.debug_ir = debug_ir
         self.bf16_emulation = bf16_emulation
+        # Legacy attributes referenced by some callers
+        self._npu_kernel = None
+        self._handle = None
+        self._runtime = None
+        self.currently_loaded = False
+        # These were set as side-effects of load()
+        self.xclbin = None
+        self.elf = None
+        self.device = None
+        self.context = None
+        self.kernel = None
+        self.bo_instr = None
+        self.instr_v = None
 
     def __del__(self):
         self.unload()
@@ -134,467 +757,86 @@ def compile(
         kernel="MLIR_AIE",
         insts="air.insts.bin",
     ):
-        """Compiles an AIR module for the NPU / XRT Runtime with aircc.
-
-        The module is expected to be AIR dialect IR. The input IR is passed directly to aircc.
-
-        Args:
-            air_module: The MLIR module consisting of funcs in the AIR dialect.
-            output_binary_name: base name for the output binary (without extension).
-                Extension is determined by output_format: .xclbin, .elf, or .txn
-            kernel: kernel name to use
-            insts: instruction filename to use
-        Returns:
-            An XRTCompileArtifact object
-        """
+        """Compile an AIR module. Returns XRTCompileArtifact for compat."""
+        npu_kernel = compile_air(
+            air_module,
+            verbose=self.verbose,
+            target_device=self.target_device,
+            omit_while_true_loop=self.omit_while_true_loop,
+            omit_pingpong=self.omit_pingpong,
+            lower_linalg_to_func=self.lower_linalg_to_func,
+            air_loop_fusion=self.air_loop_fusion,
+            runtime_loop_tiling_sizes=self.runtime_loop_tiling_sizes,
+            omit_auto_broadcast=self.omit_auto_broadcast,
+            channel_multiplexing=self.channel_multiplexing,
+            use_lock_race_condition_fix=self.use_lock_race_condition_fix,
+            trace_offset=self.trace_offset,
+            trace_size=self.trace_size,
+            output_format=self.output_format,
+            kernel_name=self.kernel_name,
+            instance_name=self.instance_name,
+            kernel_id=self.kernel_id,
+            xclbin_input=self.xclbin_input,
+            num_device_cols=self.num_device_cols,
+            debug_ir=self.debug_ir,
+            bf16_emulation=self.bf16_emulation,
+            output_binary_name=output_binary_name,
+            insts=insts,
+        )
+        self._npu_kernel = npu_kernel
+        # Build a compat artifact
+        xclbin_path = getattr(npu_kernel, "xclbin_path", output_binary_name)
+        kernel_name = getattr(npu_kernel, "kernel_name", kernel)
+        insts_path = getattr(npu_kernel, "insts_path", insts)
+        return XRTCompileArtifact(xclbin_path, kernel_name, insts_path)
+
+    def load(self, artifact):
+        """Load a compiled artifact. Returns an invoker callable."""
         if self.currently_loaded:
             raise AirBackendError(
-                "Cannot use XRTBackend to compile while the artifact is currently loaded. Call unload() first."
+                "Cannot load while an artifact is currently loaded. Call unload() first."
             )
-
-        # Determine target device: use explicit parameter if provided, otherwise auto-detect
-        if self.target_device is not None:
-            target_device = self.target_device
-            if self.verbose:
-                print(f"Using explicitly specified target device: {target_device}")
-        else:
-            # Try to auto-detect device via xrt-smi
-            target_device = "npu1"  # Default fallback
-            try:
-                import re
-
-                xrtsmi = "/opt/xilinx/xrt/bin/xrt-smi"
-                result = subprocess.run(
-                    [xrtsmi, "examine"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
-                )
-                result = result.stdout.decode("utf-8").split("\n")
-                # Older format is "|[0000:41:00.1]  ||RyzenAI-npu1  |"
-                # Newer format is "|[0000:41:00.1]  |NPU Phoenix  |"
-                p = re.compile(
-                    r"[\|]?(\[.+:.+:.+\]).+\|(RyzenAI-(npu\d)|NPU (\w+))\W*\|"
+        if self._npu_kernel is None:
+            # Reconstruct NPUKernel from the artifact for the case where
+            # compile() was called separately.
+            if _HAS_AIE_RUNTIME:
+                self._npu_kernel = NPUKernel(
+                    artifact.output_binary,
+                    artifact.insts,
+                    kernel_name=artifact.kernel,
                 )
-                for l in result:
-                    m = p.match(l)
-                    if not m:
-                        continue
-                    if self.verbose:
-                        print("Found Ryzen AI device:", m.group(1))
-                    model = "unknown"
-                    if m.group(3):
-                        model = str(m.group(3))
-                    if m.group(4):
-                        model = str(m.group(4))
-                    if self.verbose:
-                        print(f"\tmodel: '{model}'")
-                    if model in ["npu1", "Phoenix"]:
-                        target_device = "npu1"
-                    elif model in ["npu4", "Strix"]:
-                        target_device = "npu2"
-                    else:
-                        print("WARNING: xrt-smi reported unknown NPU model '{model}'.")
-                    break
-            except Exception as e:
-                if self.verbose:
-                    print("Failed to run xrt-smi, using default target device")
-                    print(e)
-
-        # Validate output_format compatibility with target device
-        if self.output_format == "elf" and "npu1" in target_device:
-            raise AirBackendError(
-                f"output_format='elf' is not supported for {target_device} target. "
-                "ELF output format is only supported on npu2 and later devices."
-            )
-
-        # Apply user-specified device column configuration if provided
-        if self.num_device_cols > 0:
-            # Validate column count based on detected device
-            max_cols = 4 if target_device == "npu1" else 8
-            if self.num_device_cols > max_cols - 1:
-                raise AirBackendError(
-                    f"Invalid num_device_cols value: {self.num_device_cols}. "
-                    f"For {target_device}, valid values are 0 (entire device) or 1-{max_cols-1}"
-                )
-            base_device = target_device
-            target_device = f"{target_device}_{self.num_device_cols}col"
-            if self.verbose:
-                print(
-                    f"Confining design to {self.num_device_cols} column(s) of {base_device} device: {target_device}"
-                )
-
-        import os, site, glob
-
-        # Try to get peano package dir from environment variable, fallback to site-packages
-        peano_package_dir = os.environ.get("PEANO_INSTALL_DIR", "")
-
-        if peano_package_dir and os.path.isdir(peano_package_dir):
-            print(
-                "XRTBackend: llvm-aie package detected via PEANO_INSTALL_DIR:",
-                peano_package_dir,
-            )
-
-        # Determine output file extension based on output_format
-        if self.output_format == "elf":
-            output_binary = f"{output_binary_name}.elf"
-        elif self.output_format == "txn":
-            output_binary = f"{output_binary_name}.txn"
-        else:  # xclbin (default)
-            output_binary = f"{output_binary_name}.xclbin"
-
-        with air.ir.Context():
-
-            if self.verbose:
-                print("AIR Module:")
-                print(air_module)
-
-            aircc_options = [
-                "--device",
-                target_device,
-                "air.mlir",
-            ]
-
-            # Add output file options based on format
-            if self.output_format == "elf":
-                aircc_options += ["--elf-name", output_binary]
-                # Note: ELF mode features (main device wrapper, load_pdi) are
-                # automatically enabled by --output-format=elf in aircc
-            else:
-                aircc_options += ["-o", output_binary]
-                aircc_options += ["-i", insts]
-
-            for s in self.runtime_loop_tiling_sizes:
-                aircc_options += [f"--air-runtime-loop-tiling-sizes={s}"]
-
-            if self.verbose:
-                aircc_options = aircc_options + ["-v"]
-
-            if self.omit_while_true_loop:
-                aircc_options += ["--omit-while-true-loop"]
-
-            if self.omit_pingpong:
-                # Handle both bool (True -> "all") and string ("L1", "L2", "all")
-                pp_val = (
-                    "all" if self.omit_pingpong is True else str(self.omit_pingpong)
-                )
-                aircc_options += [f"--omit-ping-pong-transform={pp_val}"]
-
-            if self.lower_linalg_to_func:
-                aircc_options += ["--lower-linalg-to-func"]
-                aircc_options += [self.lower_linalg_to_func]
-
-            if self.air_loop_fusion:
-                aircc_options += ["--air-loop-fusion"]
-
-            if self.omit_auto_broadcast:
-                aircc_options += ["--omit-auto-broadcast"]
-
-            if len(self.channel_multiplexing) != 0:
-                for ch in self.channel_multiplexing:
-                    aircc_options += [f"--air-channel-multiplexing={ch}"]
-
-            if self.use_lock_race_condition_fix:
-                aircc_options += ["--use-lock-race-condition-fix"]
-
-            if self.trace_size != 0:
-                aircc_options += ["-trace-size"]
-                aircc_options += [str(self.trace_size)]
-                aircc_options += ["-trace-offset"]
-                aircc_options += [str(self.trace_offset)]
-
-            if self.output_format != "":
-                aircc_options += ["--output-format"]
-                aircc_options += [self.output_format]
-            if self.kernel_name != "":
-                aircc_options += ["--xclbin-kernel-name"]
-                aircc_options += [self.kernel_name]
-            if self.instance_name != "":
-                aircc_options += ["--xclbin-instance-name"]
-                aircc_options += [self.instance_name]
-            if self.kernel_id != "":
-                aircc_options += ["--xclbin-kernel-id"]
-                aircc_options += [self.kernel_id]
-            if self.xclbin_input != "":
-                aircc_options += ["--xclbin-input"]
-                aircc_options += [self.xclbin_input]
-            if peano_package_dir != "":
-                aircc_options += ["--peano"]
-                aircc_options += [peano_package_dir]
-                aircc_options += ["--no-xchesscc"]
-                aircc_options += ["--no-xbridge"]
-            else:
-                aircc_options += ["--xchesscc"]
-                aircc_options += ["--xbridge"]
-
-            if self.debug_ir:
-                aircc_options += ["--debug-ir"]
-
-            if self.bf16_emulation:
-                aircc_options += ["--bf16-emulation"]
-
-            if self.verbose:
-                print("Running aircc with options:", " ".join(aircc_options))
-
-            # Write the in-memory module to the input file expected by aircc
-            with open("air.mlir", "w") as f:
-                f.write(str(air_module))
-
-            # Invoke the C++ aircc binary
-            aircc_exe = shutil.which("aircc")
-            if not aircc_exe:
-                raise AirBackendError(
-                    "aircc binary not found in PATH. "
-                    "Ensure mlir-air is installed and aircc is on PATH."
-                )
-            result = subprocess.run(
-                [aircc_exe] + aircc_options,
-                capture_output=True,
-                text=True,
-            )
-            if result.returncode != 0:
-                error_msg = result.stderr if result.stderr else result.stdout
-                raise AirBackendError(f"aircc compilation failed:\n{error_msg}")
-
-        # For ELF mode, the kernel identifier is "main:instance_name"
-        # This is used when loading the ELF via xrt.ext.kernel()
-        if self.output_format == "elf" and self.instance_name != "":
-            elf_kernel = f"main:{self.instance_name}"
-        else:
-            elf_kernel = kernel
-
-        return XRTCompileArtifact(output_binary, elf_kernel, insts)
-
-    def compile_from_torch_mlir(
-        self,
-        imported_module,
-        pipeline=None,
-        verbose=False,
-    ):
-        import torch_mlir
-        import torch_mlir.passmanager
-
-        if type(imported_module) is torch_mlir.ir.Module:
-            with imported_module.operation.context:
-                pm = torch_mlir.passmanager.PassManager.parse(
-                    "builtin.module(refback-mlprogram-bufferize)"
-                )
-                pm.run(imported_module.operation)
-
-        with air.ir.Context():
-            linalg_module = air.ir.Module.parse(str(imported_module))
-            pm = air.passmanager.PassManager.parse(
-                air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE
-            )
-            if verbose:
-                print(
-                    "Running MLIR pass pipeline: ",
-                    air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE,
-                )
-            pm.run(linalg_module.operation)
-
-            if verbose:
-                print("Linalg Module:")
-                print(linalg_module)
-
-            DEFAULT_PIPELINE = (
-                "builtin.module("
-                + ",".join(
-                    [
-                        "buffer-results-to-out-params",
-                        "air-linalg-codegen",
-                        "air-par-to-herd{depth=-1}",
-                        "air-par-to-launch{has-air-segment=true}",
-                        "air-copy-to-dma",
-                        "canonicalize",
-                        "cse",
-                    ]
-                )
-                + ")"
-            )
-            if pipeline is None:
-                pipeline = DEFAULT_PIPELINE
-
-            if callable(pipeline):
-                air_module = pipeline(linalg_module)
-            else:
-                pm = air.passmanager.PassManager.parse(pipeline)
-                pm.run(linalg_module.operation)
-                air_module = linalg_module
-
-            if verbose:
-                print("Air Module:")
-                print(air_module)
-
-        return self.compile(air_module)
-
-    def load(self, artifact: XRTCompileArtifact):
-        """Load a compiled artifact into the air runtime.
-
-        Args:
-            artifact: The result of calling compile with XRTBackend on an MLIR-AIR module.
-                Supports both xclbin and ELF formats.
-
-        Returns: A callable that can be used to invoke the loaded module.
-            The callable takes a list of numpy arrays. Each numpy array is
-            assumed to be an input/output tensor. The callable also returns a
-            list of numpy arrays, one for each tensor.
-        """
-        # Try to import pyxrt - it's only needed for load(), not compile()
-        try:
-            import pyxrt as xrt
-        except ImportError:
-            raise AirBackendError(
-                "XRT runtime (pyxrt) is not available. "
-                "The compile() method can generate artifacts without XRT, "
-                "but load() requires XRT to be installed for hardware execution. "
-                "To compile without XRT, use compile() and specify target_device parameter. "
-                "Install XRT to use load() for hardware execution."
-            )
-
-        if self.currently_loaded:
-            raise AirBackendError(
-                "Cannot use XRTBackend to compile while the artifact is currently loaded. Call unload() first."
-            )
+        self._runtime = get_air_runtime()
+        self._handle = self._runtime.load(self._npu_kernel)
+        self.currently_loaded = True
 
-        if not os.path.isfile(artifact.output_binary):
-            raise AirBackendError(
-                f"Cannot load XRTCompileArtifact because {artifact.output_binary} file does not exist"
-            )
+        # Return a callable that mimics the old invoker interface.
+        # Use _tensor() factory (selects XRTTensor or CPUOnlyTensor based on
+        # pyxrt availability) rather than XRTTensor() directly.
+        runtime = self._runtime
+        handle = self._handle
 
-        # Determine the loading mode based on file extension
-        is_elf = artifact.output_binary.endswith(".elf")
-
-        # create the device
-        self.device = xrt.device(0)
-
-        if is_elf:
-            # ELF loading path - uses experimental APIs
-            # No instruction file needed for ELF (instructions embedded in ELF)
-            try:
-                self.elf = xrt.elf(artifact.output_binary)
-                self.context = xrt.hw_context(self.device, self.elf)
-                self.kernel = xrt.ext.kernel(self.context, artifact.kernel)
-            except Exception as e:
-                raise AirBackendError(
-                    f"Failed to load ELF kernel for XRT from '{artifact.output_binary}' "
-                    f"with kernel name '{artifact.kernel}'. "
-                    "Ensure this file is a valid ELF binary compiled for the target device "
-                    "and that it contains a kernel symbol matching the provided name."
-                ) from e
-            self.bo_instr = None  # Not needed for ELF
-            self.instr_v = None
-
-            def invoker(*args):
-                sizes_in_bytes = [a.size * a.itemsize for a in args]
-                # Use xrt.ext.bo for ELF mode (simpler, no group_id needed)
-                bos = [xrt.ext.bo(self.device, s) for s in sizes_in_bytes]
-
-                for i, a in enumerate(args):
-                    if a.dtype == bfloat16:
-                        a = a.view(np.int16)
-                    bos[i].write(a, 0)
-                    bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-
-                # Use xrt.run for ELF mode
-                run = xrt.run(self.kernel)
-                for i, bo in enumerate(bos):
-                    run.set_arg(i, bo)
-                run.start()
-                run.wait2()
-
-                for i, a in enumerate(args):
-                    bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
-                return tuple(
-                    [
-                        bos[i].read(s, 0).view(args[i].dtype)
-                        for i, s in enumerate(sizes_in_bytes)
-                    ]
-                )
+        def invoker(*args):
+            import filelock
 
-        else:
-            # xclbin loading path - original implementation
-            if not os.path.isfile(artifact.insts):
-                raise AirBackendError(
-                    f"Cannot load XRTCompileArtifact because {artifact.insts} insts file does not exist"
-                )
+            tensors = [_tensor(a) for a in args]
+            with filelock.FileLock("/tmp/npu.lock"):
+                runtime.run(handle, tensors)
+            return tuple(t.numpy() for t in tensors)
 
-            self.xclbin = xrt.xclbin(artifact.output_binary)
-            self.device.register_xclbin(self.xclbin)
-            self.context = xrt.hw_context(self.device, self.xclbin.get_uuid())
-
-            # find and load the kernel
-            kernels = self.xclbin.get_kernels()
-            try:
-                xkernel = [k for k in kernels if artifact.kernel in k.get_name()][0]
-            except:
-                raise AirBackendError(
-                    f"Kernel '{artifact.kernel}' not found in '{artifact.output_binary}'"
-                )
-            self.kernel = xrt.kernel(self.context, xkernel.get_name())
-
-            # load the instructions as a numpy array
-            with open(artifact.insts, "rb") as f:
-                instr_data = f.read()
-                self.instr_v = np.frombuffer(instr_data, dtype=np.uint32)
-
-            self.bo_instr = xrt.bo(
-                self.device,
-                len(self.instr_v) * 4,
-                xrt.bo.cacheable,
-                self.kernel.group_id(1),
-            )
-            self.bo_instr.write(self.instr_v, 0)
-
-            def invoker(*args):
-                # limit arg length to 5
-                if len(args) > 5:
-                    raise ValueError("Too many arguments")
-                sizes_in_bytes = [a.size * a.itemsize for a in args]
-                bos = [
-                    xrt.bo(
-                        self.device, s, xrt.bo.host_only, self.kernel.group_id(i + 3)
-                    )
-                    for i, s in enumerate(sizes_in_bytes)
-                ]
-
-                self.bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-                for i, a in enumerate(args):
-                    if a.dtype == bfloat16:
-                        # store bfloat16 in binary as int16
-                        a = a.view(np.int16)
-                    bos[i].write(a, 0)
-                    bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
-
-                h = self.kernel(3, self.bo_instr, len(self.instr_v), *bos)
-                h.wait()
-
-                for i, a in enumerate(args):
-                    bos[i].sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
-                return tuple(
-                    [
-                        bos[i].read(s, 0).view(args[i].dtype)
-                        for i, s in enumerate(sizes_in_bytes)
-                    ]
-                )
-
-        self.currently_loaded = True
         return invoker
 
     def compile_and_load(self, module):
-        """
-        Compile and load a module in one step.
-
-        Args:
-            air_module: The MLIR module consisting of funcs in the AIR dialect.
-
-        Returns: A callable that can be used to invoke the loaded module.
-            The callable takes a list of numpy arrays. Each numpy array is
-            assumed to be an input/output tensor. The callable also returns a
-            list of numpy arrays, one for each tensor.
-        """
+        """Compile and load in one step."""
         c = self.compile(module)
         return self.load(c)
 
     def unload(self):
-        """Unload any loaded module and shutdown the air runtime."""
+        """Unload any loaded module."""
+        self._handle = None
+        self._runtime = None
+        self._npu_kernel = None
+        self.currently_loaded = False
+        # Clear legacy attributes
         self.kernel = None
         self.context = None
         self.xclbin = None
@@ -602,4 +844,3 @@ def unload(self):
         self.device = None
         self.bo_instr = None
         self.instr_v = None
-        self.currently_loaded = False
diff --git a/python/air/backend/xrt_runner.py b/python/air/backend/xrt_runner.py
index b9fe9aae0..0a9dc5925 100644
--- a/python/air/backend/xrt_runner.py
+++ b/python/air/backend/xrt_runner.py
@@ -3,14 +3,57 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 # SPDX-License-Identifier: MIT
 
-import numpy as np
-from .xrt import XRTBackend
-from air.dialects.air import *
+"""
+High-level runner helpers for mlir-air programming examples and tests.
+
+Public API
+----------
+run_on_npu(args, mlir_module, inputs, instance_name, ...)
+    Compile (and optionally run) an AIR module, dispatching on
+    args.compile_mode.  Replaces the boilerplate if/elif block found
+    in every example's __main__.
+
+make_air_parser(description, prog)
+    Return an ArgumentParser pre-populated with the four universal flags.
+
+type_mapper(np_dtype)
+    Map a numpy dtype to the corresponding MLIR type inside a module context.
+
+TYPE_MAP_DICT
+    The underlying defaultdict used by type_mapper.
+"""
+
 import filelock
-from typing import List
+import numpy as np
 from collections import defaultdict
 from ml_dtypes import bfloat16
-import timeit
+from typing import List
+
+from air.dialects.air import *
+
+from .abc import AirBackendError
+from .xrt import (
+    compile_air,
+    get_air_runtime,
+    AirRuntime,
+    XRTTensor,
+    XRTBackend,
+    XRTCompileArtifact,
+)
+
+try:
+    import aie.utils as _aie_utils
+
+    _tensor = _aie_utils.tensor
+    _has_aie_utils = True
+except ImportError:
+    _has_aie_utils = False
+    _tensor = None
+
+
+# ---------------------------------------------------------------------------
+# Type mapping helpers (unchanged — used by many callers)
+# ---------------------------------------------------------------------------
 
 TYPE_MAP_DICT = defaultdict(
     lambda: None,
@@ -35,36 +78,217 @@
 
 
 def type_mapper(np_dtype):
-    """
-    This function is meant to run within a module context (e.g., with a function wrapped with @build_module)
-    args:
-        np_dtype: the numpy data type to map
-    return:
-        The data type to run on the npu
+    """Map a numpy dtype to the MLIR type for use inside a module context.
+
+    Args:
+        np_dtype: The numpy data type to map.
+
+    Returns:
+        The corresponding MLIR type.
+
+    Raises:
+        AirBackendError: If the dtype has no known mapping.
     """
     xrt_dtype = TYPE_MAP_DICT[np_dtype]()
 
     if xrt_dtype is None:
         raise AirBackendError(f"numpy data type {np_dtype} has no default mapping")
     elif xrt_dtype.width / 8 != np.dtype(np_dtype).itemsize:
-        # This is a sanity check on the TYPE_MAP_DICT rather than a check on the user input
         raise AirBackendError(
-            f"Python data type has width {xrt_dtype.width / 8} but numpy data type has width {np.dtype(np_dtype).itemsize}"
+            f"Python data type has width {xrt_dtype.width / 8} but numpy data type "
+            f"has width {np.dtype(np_dtype).itemsize}"
         )
     return xrt_dtype
 
 
+# ---------------------------------------------------------------------------
+# Argument parser factory (unchanged)
+# ---------------------------------------------------------------------------
+
+
+def make_air_parser(description, prog="run.py"):
+    """Build the argparse parser shared by the example scripts.
+
+    Options registered before the parser is returned:
+        -v / --verbose            (store_true)
+        -p / --print-module-only  (store_true)
+        --compile-mode   compile-only | compile-and-run (default)
+        --output-format  xclbin (default) | elf
+
+    Callers append their own example-specific arguments to the result.
+    """
+    from argparse import ArgumentParser
+
+    air_parser = ArgumentParser(prog=prog, description=description)
+    bool_flags = (("-v", "--verbose"), ("-p", "--print-module-only"))
+    for short_flag, long_flag in bool_flags:
+        air_parser.add_argument(short_flag, long_flag, action="store_true")
+
+    # Each entry: (flag, dest, allowed values, default).
+    mode_opt = (
+        "--compile-mode",
+        "compile_mode",
+        ["compile-only", "compile-and-run"],
+        "compile-and-run",
+    )
+    out_format_opt = ("--output-format", "output_format", ["xclbin", "elf"], "xclbin")
+    for flag, dest_name, allowed, fallback in (mode_opt, out_format_opt):
+        air_parser.add_argument(
+            flag, type=str, choices=allowed, dest=dest_name, default=fallback
+        )
+    return air_parser
+
+
+# ---------------------------------------------------------------------------
+# check_print_module helper (kept here for backward compat)
+# ---------------------------------------------------------------------------
+
+
+def check_print_module(mlir_module, args):
+    """Print the MLIR module and exit (status 0) if args.print_module_only is set."""
+    if args.print_module_only:
+        print(mlir_module)
+        raise SystemExit(0)  # not exit(): that builtin only exists via `site`
+
+
+# ---------------------------------------------------------------------------
+# run_on_npu() — the main dispatch helper
+# ---------------------------------------------------------------------------
+
+
+def run_on_npu(
+    args,
+    mlir_module,
+    inputs,
+    instance_name,
+    expected_outputs=None,
+    stochastic_expected_outputs=None,
+    rtol: float = 1e-3,
+    atol: float = 1e-8,
+    runtime_loop_tiling_sizes=None,
+    max_mismatch_percentage: float = 0.0,
+    min_correlation=None,
+    # Extra compile_air kwargs; may also override omit_while_true_loop
+    **compile_kwargs,
+) -> int:
+    """Compile (and optionally run+verify) an AIR module.
+
+    Dispatches on args.compile_mode:
+      - "compile-only"    → compile, write artifacts, return 0
+      - "compile-and-run" → compile, run on NPU, verify, return exit code
+
+    Args:
+        args: Parsed argparse namespace (must have .verbose, .compile_mode,
+              .output_format, and optionally .print_module_only).
+        mlir_module: MLIR module from build_module() / Module.parse().
+        inputs: List of numpy input arrays.
+        instance_name: xclbin instance name string.
+        expected_outputs: List of numpy reference arrays (dense check).
+        stochastic_expected_outputs: List of {"shape","indices","values"} dicts.
+        rtol: Relative tolerance forwarded to AirRuntime.run_test().
+        atol: Absolute tolerance forwarded to AirRuntime.run_test().
+        runtime_loop_tiling_sizes: Tiling sizes (default [4, 4]).
+        max_mismatch_percentage: Max % of elements allowed to mismatch.
+        min_correlation: Minimum Pearson correlation (None = disabled).
+        **compile_kwargs: Extra compile_air() kwargs; may set omit_while_true_loop.
+
+    Returns:
+        int: 0 for print-only/compile-only, else run_test()'s result (0 = pass).
+    """
+    if runtime_loop_tiling_sizes is None:
+        runtime_loop_tiling_sizes = [4, 4]
+
+    # --print-module-only support: nothing is compiled on this path
+    if getattr(args, "print_module_only", False):
+        print(mlir_module)
+        return 0
+
+    compile_kwargs.setdefault("omit_while_true_loop", False)  # caller may override
+    npu_kernel = compile_air(
+        mlir_module,
+        verbose=args.verbose,
+        output_format=args.output_format,
+        instance_name=instance_name,
+        runtime_loop_tiling_sizes=runtime_loop_tiling_sizes,
+        **compile_kwargs,
+    )
+
+    if args.compile_mode == "compile-only":
+        return 0
+
+    # compile-and-run
+    runtime = get_air_runtime()
+
+    # Build io_args: inputs + zero-initialised output buffers.
+    # Wrap with aie.utils.tensor() when importable; otherwise pass raw arrays.
+    input_tensors = [_tensor(a) if _has_aie_utils else a for a in inputs]
+    output_tensors = _make_output_tensors(
+        expected_outputs or [], stochastic_expected_outputs or []
+    )
+    io_args = input_tensors + output_tensors
+
+    # Build refs dict: map output buffer index → expected numpy array
+    refs = {len(inputs) + i: exp for i, exp in enumerate(expected_outputs or [])}
+
+    return runtime.run_test(
+        npu_kernel,
+        io_args,
+        refs=refs,
+        rtol=rtol,
+        atol=atol,
+        max_mismatch_percentage=max_mismatch_percentage,
+        min_correlation=min_correlation,
+        stochastic_refs=stochastic_expected_outputs or [],
+    )
+
+
+def _make_output_tensors(expected_outputs, stochastic_expected_outputs):
+    """Allocate zero-filled tensors matching each expected output spec.
+
+    Uses aie.utils.tensor() when importable; otherwise plain numpy arrays.
+    Sparse refs are dicts with "shape" and "values"; the dtype is read from
+    values, or from values[0] for a list/tuple of numpy scalars.
+    """
+    wrap = _tensor if _has_aie_utils else (lambda arr: arr)
+    tensors = [wrap(np.zeros(e.shape, dtype=e.dtype)) for e in expected_outputs]
+    for sref in stochastic_expected_outputs:
+        values = sref["values"]
+        dtype = getattr(values, "dtype", None)
+        if dtype is None and isinstance(values, (list, tuple)) and values:
+            # A list of numpy scalars carries the dtype on its elements.
+            dtype = getattr(values[0], "dtype", None)
+        if dtype is None:
+            dtype = np.float32  # nothing to infer from; keep the old default
+        shape = sref["shape"]
+        if isinstance(shape, int):
+            shape = (shape,)
+        tensors.append(wrap(np.zeros(shape, dtype=dtype)))
+    return tensors
+
+
+# ---------------------------------------------------------------------------
+# Backward compatibility shim — XRTRunner
+# ---------------------------------------------------------------------------
+
+
 class XRTRunner:
+    """
+    Deprecated. Use compile_air() + get_air_runtime() or run_on_npu() instead.
+
+    This shim preserves the old XRTRunner interface so existing call sites
+    continue to work without modification during the migration period.
+    """
+
     def __init__(
         self,
         verbose: bool = False,
         omit_while_true_loop: bool = True,
         omit_pingpong: str = "",
-        lower_linalg_to_func: bool = False,
+        lower_linalg_to_func=None,
         air_loop_fusion: bool = False,
-        runtime_loop_tiling_sizes: list[int] = [],
+        runtime_loop_tiling_sizes: list = None,
         omit_auto_broadcast: bool = False,
-        channel_multiplexing: list[str] = [],
+        channel_multiplexing: list = None,
         use_lock_race_condition_fix: bool = False,
         trace_offset: int = 0,
         trace_size: int = 0,
@@ -79,45 +303,18 @@ def __init__(
         bf16_emulation: bool = False,
         target_device: str = None,
     ):
-        """
-        Args:
-            verbose: verbose output
-            omit_while_true_loop: configure aircc to omit the while true loop it traditionally emits.
-            omit_pingpong: configure aircc to omit the generation of ping-pong buffering for specific memory levels. Supported values: "", "L1", "L2", "all". Empty string means no omission (default).
-            lower_linalg_to_func: configure aircc to lower linalg.generic to function calls, or loops.
-            air_loop_fusion: configure aircc to add air-loop-fusion experimental pass.
-            runtime_loop_tiling_sizes: configure aircc to add extra runtime loop tiling using the experimental affine-loop-opt pass.
-            omit_auto_broadcast: configure aircc to omit the detection and lowering of broadcast data movements.
-            channel_multiplexing: configure aircc to perform air channel multiplexing on specified memroy spaces.
-            use_lock_race_condition_fix: configure aircc to enable a fix for lock race condition which protects against race condition.
-            trace_offset: configure aircc to stream out profiling traces at outputs, starting from the specified offset.
-            trace_size: configure aircc to stream out profiling traces at outputs, with specified trace data size.
-            output_format: configure aircc to produce output binary in to one of the following formats: [xclbin, txn, elf].
-            kernel_name: configure aircc to package the kernel with the specified name.
-            instance_name: configure aircc to package the kernel with specified instance name in xclbin metadata.
-            kernel_id: configure aircc to package the kernel with specified kernel id in xclbin file.
-            xclbin_input: configure aircc to package the kernel into an existing xclbin with specified xclbin file name.
-            trace_file: default filename for saving trace data.
-            num_device_cols: number of device columns to confine the design within (0 means entire device, default).
-                For npu1 (4 columns total): valid values are 0 (entire device), 1, 2, 3
-                For npu2 (8 columns total): valid values are 0 (entire device), 1, 2, 3, 4, 5, 6, 7
-            debug_ir: enable debug mode to emit IR after each individual pass for fine-grained inspection.
-                IRs are saved to <tmpdir>/debug_ir/ with sequence numbers.
-            bf16_emulation: emulate f32 vector arithmetic using bf16 operations.
-            target_device: specify target device explicitly ("npu1", "npu2", etc.). If None, will attempt auto-detection.
-        """
         self.verbose = verbose
-        self.omit_while_true_loop = omit_while_true_loop
-        # Support backward compatibility: convert True to "all", False to ""
+        # Support backward compatibility: convert bool omit_pingpong
         if isinstance(omit_pingpong, bool):
             self.omit_pingpong = "all" if omit_pingpong else ""
         else:
             self.omit_pingpong = omit_pingpong
+        self.omit_while_true_loop = omit_while_true_loop
         self.lower_linalg_to_func = lower_linalg_to_func
         self.air_loop_fusion = air_loop_fusion
-        self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes
+        self.runtime_loop_tiling_sizes = runtime_loop_tiling_sizes or []
         self.omit_auto_broadcast = omit_auto_broadcast
-        self.channel_multiplexing = channel_multiplexing
+        self.channel_multiplexing = channel_multiplexing or []
         self.use_lock_race_condition_fix = use_lock_race_condition_fix
         self.trace_offset = trace_offset
         self.trace_size = trace_size
@@ -134,34 +331,45 @@ def __init__(
 
     def run_test(
         self,
-        mlir_module: np.ndarray,
+        mlir_module,
         inputs: List[np.ndarray],
-        expected_outputs: List[np.ndarray] = [],
-        stochastic_expected_outputs: List[np.ndarray] = [],
+        expected_outputs: List[np.ndarray] = None,
+        stochastic_expected_outputs: List = None,
         rtol: float = 1e-3,
         atol: float = 1e-8,
         max_mismatch_percentage: float = 0,
-        min_correlation: float = None,
+        min_correlation=None,
         trace_file: str = None,
-    ):
-        """
+    ) -> int:
+        """Compile, run and verify an AIR module.
+
         Args:
-            mlir_module: input mlir module to test.
-            inputs: input matrices.
-            expected_outputs: expected output matrices.
-            stochastic_expected_outputs: expected output matrices stored in sparse coordinates. Expect each matrix to be a dictionary containing "shape", "indices" and "values" fields.
-            rtol: relative error tolerance.
-            atol: absolute error tolerance.
-            max_mismatch_percentage: max percentage (0-100) of elements allowed to exceed tolerance (0 = all must pass, 20 = 20% can fail).
-            min_correlation: minimum Pearson correlation coefficient (0-1) between actual and expected outputs for floating-point data. None disables this check.
-            trace_file: optional override for trace data filename. If None, uses instance default.
+            mlir_module: MLIR module to test.
+            inputs: Input numpy arrays.
+            expected_outputs: Expected dense output arrays.
+            stochastic_expected_outputs: Sparse reference dicts.
+            rtol: Relative tolerance.
+            atol: Absolute tolerance.
+            max_mismatch_percentage: Max % of mismatches tolerated.
+            min_correlation: Min Pearson correlation (None = disabled).
+            trace_file: Override trace data filename.
+
+        Returns:
+            0 on pass, -1 on failure.
         """
+        if expected_outputs is None:
+            expected_outputs = []
+        if stochastic_expected_outputs is None:
+            stochastic_expected_outputs = []
+
         if self.verbose:
-            print("Running module: ")
+            print("Running module:")
             print(mlir_module)
 
-        backend = XRTBackend(
+        npu_kernel = compile_air(
+            mlir_module,
             verbose=self.verbose,
+            target_device=self.target_device,
             omit_while_true_loop=self.omit_while_true_loop,
             omit_pingpong=self.omit_pingpong,
             lower_linalg_to_func=self.lower_linalg_to_func,
@@ -180,393 +388,159 @@ def run_test(
             num_device_cols=self.num_device_cols,
             debug_ir=self.debug_ir,
             bf16_emulation=self.bf16_emulation,
-            target_device=self.target_device,
         )
 
-        # Use per-test trace file if provided, otherwise use instance default
-        active_trace_file = trace_file if trace_file is not None else self.trace_file
-
-        # run the module - slots are input/output for now, assume non-overlapping inputs/outputs
-        # Handle different scenarios for trace data
+        # Handle trace mode separately (uses legacy raw-numpy path for now)
         if self.trace_size > 0:
-            if expected_outputs:
-                # Case 1: Both outputs and trace
-                # Add trace_size bytes to first output
-                total_bytes = expected_outputs[0].nbytes + self.trace_size
-                first_output_with_trace = np.zeros(total_bytes, dtype=np.uint8)
-                remaining_outputs = [
-                    np.zeros(o.shape, o.dtype) for o in expected_outputs[1:]
-                ]
-                output_placeholders = [first_output_with_trace] + remaining_outputs
-                if self.verbose:
-                    print(
-                        f"Allocated {total_bytes} bytes for first output + {self.trace_size} bytes for trace data"
-                    )
-                # Record the expected_outputs[0]'s shape and dtype, to be used to split actual outputs from trace.
-                expected_outputs_0_shape = expected_outputs[0].shape
-                expected_outputs_0_dtype = expected_outputs[0].dtype
-            elif stochastic_expected_outputs:
-                # Case 2: Stochastic outputs and trace
-                first_output_elements = np.prod(stochastic_expected_outputs[0]["shape"])
-                first_output_bytes = (
-                    first_output_elements
-                    * stochastic_expected_outputs[0]["values"][0].dtype.itemsize
-                )
-                total_bytes = first_output_bytes + self.trace_size
-                first_output_with_trace = np.zeros(total_bytes, dtype=np.uint8)
-                remaining_outputs = [
-                    np.zeros(o["shape"], o["values"][0].dtype)
-                    for o in stochastic_expected_outputs[1:]
-                ]
-                output_placeholders = [first_output_with_trace] + remaining_outputs
-                if self.verbose:
-                    print(
-                        f"Allocated {first_output_bytes} bytes for first stochastic output + {self.trace_size} bytes for trace data"
-                    )
-                # Record the expected_outputs[0]'s shape and dtype, to be used to split actual outputs from trace.
-                expected_outputs_0_shape = stochastic_expected_outputs[0]["shape"]
-                expected_outputs_0_dtype = stochastic_expected_outputs[0][
-                    "values"
-                ].dtype
-            else:
-                # Case 3: Trace only, no expected outputs
-                trace_only_output = np.zeros(self.trace_size, dtype=np.uint8)
-                output_placeholders = [trace_only_output]
-                if self.verbose:
-                    print(
-                        f"Trace-only mode: allocated {self.trace_size} bytes for trace data"
-                    )
+            return self._run_with_trace(
+                npu_kernel,
+                inputs,
+                expected_outputs,
+                stochastic_expected_outputs,
+                rtol,
+                atol,
+                max_mismatch_percentage,
+                trace_file or self.trace_file,
+            )
+
+        # Standard (no-trace) path
+        runtime = get_air_runtime()
+        input_tensors = [_tensor(a) if _has_aie_utils else a for a in inputs]
+        output_tensors = _make_output_tensors(
+            expected_outputs, stochastic_expected_outputs
+        )
+        io_args = input_tensors + output_tensors
+
+        # Build refs dict
+        refs = {len(inputs) + i: exp for i, exp in enumerate(expected_outputs)}
+
+        return runtime.run_test(
+            npu_kernel,
+            io_args,
+            refs=refs,
+            rtol=rtol,
+            atol=atol,
+            max_mismatch_percentage=max_mismatch_percentage,
+            min_correlation=min_correlation,
+            stochastic_refs=stochastic_expected_outputs,
+        )
+
+    def _run_with_trace(
+        self,
+        npu_kernel,
+        inputs,
+        expected_outputs,
+        stochastic_expected_outputs,
+        rtol,
+        atol,
+        max_mismatch_percentage,
+        trace_file,
+    ) -> int:
+        """Run the kernel with trace capture, save the trace, then verify outputs."""
+        try:
+            from aie.utils import TraceConfig, HostRuntime
+        except ImportError as err:
+            raise AirBackendError(
+                "Trace utilities (aie.utils) are not available. "
+                "Install mlir-aie to use trace_size parameter."
+            ) from err
+
+        runtime = get_air_runtime()
+        # NOTE(review): min_correlation is not forwarded to this path — confirm.
+        # Output 0 gets trace_size extra bytes (raw uint8) to hold the trace data.
+        if expected_outputs:
+            total_bytes = expected_outputs[0].nbytes + self.trace_size
+            first_out = np.zeros(total_bytes, dtype=np.uint8)
+            rest_outs = [np.zeros(o.shape, o.dtype) for o in expected_outputs[1:]]
+            output_placeholders = [first_out] + rest_outs
+            expected_outputs_0_shape = expected_outputs[0].shape
+            expected_outputs_0_dtype = expected_outputs[0].dtype
+        elif stochastic_expected_outputs:
+            first_output_elements = np.prod(stochastic_expected_outputs[0]["shape"])
+            first_output_bytes = (
+                first_output_elements
+                * stochastic_expected_outputs[0]["values"][0].dtype.itemsize
+            )
+            total_bytes = first_output_bytes + self.trace_size
+            first_out = np.zeros(total_bytes, dtype=np.uint8)
+            rest_outs = [
+                np.zeros(o["shape"], o["values"][0].dtype)
+                for o in stochastic_expected_outputs[1:]
+            ]
+            output_placeholders = [first_out] + rest_outs
+            expected_outputs_0_shape = stochastic_expected_outputs[0]["shape"]
+            expected_outputs_0_dtype = stochastic_expected_outputs[0]["values"][0].dtype
         else:
-            # Case 4: No trace, original behavior
-            if expected_outputs:
-                output_placeholders = [
-                    np.zeros(o.shape, o.dtype) for o in expected_outputs
-                ]
-            elif stochastic_expected_outputs:
-                output_placeholders = [
-                    np.zeros(o["shape"], o["values"][0].dtype)
-                    for o in stochastic_expected_outputs
-                ]
-            else:
-                assert (
-                    False
-                ), f"Expect one of 'expected_outputs' and 'stochastic_expected_outputs' to not be empty, or trace_size > 0."
+            trace_only_output = np.zeros(self.trace_size, dtype=np.uint8)
+            output_placeholders = [trace_only_output]
+            expected_outputs_0_shape = None
+            expected_outputs_0_dtype = None
 
-        expanded_inputs = inputs + output_placeholders
+        all_np = inputs + output_placeholders
+        io_args = [_tensor(a) if _has_aie_utils else a for a in all_np]
 
-        compiled_module = backend.compile(mlir_module)
+        handle = runtime.load(npu_kernel)
         with filelock.FileLock("/tmp/npu.lock"):
-            module_function = backend.load(compiled_module)
-            actual_outputs = module_function(*expanded_inputs)
-
-        backend.unload()
+            runtime.run(handle, io_args)
 
-        # Remove input slots from the received outputs first
-        actual_outputs = list(actual_outputs[len(inputs) :])
+        # Extract numpy results
+        actual_outputs_np = [t.numpy() for t in io_args[len(inputs) :]]
 
-        # Handle trace data extraction and saving
-        if self.trace_size > 0:
-            # Import trace utilities only when needed for trace handling
-            try:
-                from aie.utils import TraceConfig, HostRuntime
-            except ImportError:
-                raise AirBackendError(
-                    "Trace utilities (aie.utils) are not available. "
-                    "Trace functionality requires mlir-aie to be installed. "
-                    "Install mlir-aie to use trace_size parameter."
-                )
-
-            actual_outputs[0], trace = HostRuntime._extract_prefix(
-                actual_outputs[0],
+        # Extract trace data
+        if expected_outputs_0_shape is not None:
+            actual_outputs_np[0], trace = HostRuntime._extract_prefix(
+                actual_outputs_np[0],
                 expected_outputs_0_shape,
-                expected_outputs_0_dtype,
+                np.dtype(expected_outputs_0_dtype),
             )
-            trace = trace.view(np.uint32).reshape(self.trace_size // 4)
-            trace_config = TraceConfig(
-                trace_size=self.trace_size, trace_file=active_trace_file
-            )
-            trace_config.write_trace(trace)
+        else:
+            trace = actual_outputs_np[0].view(np.uint8)
 
-            print(f"Trace data ({self.trace_size} bytes) saved to {active_trace_file}")
+        trace = trace.view(np.uint32).reshape(self.trace_size // 4)
+        trace_config = TraceConfig(trace_size=self.trace_size, trace_file=trace_file)
+        trace_config.write_trace(trace)
+        print(f"Trace data ({self.trace_size} bytes) saved to {trace_file}")
 
-        # Perform result checking only if we have expected outputs
-        if expected_outputs and actual_outputs:
-            if self._check_outputs(
-                actual_outputs=actual_outputs,
-                expected_outputs=expected_outputs,
+        # Verify results — wrap numpy arrays as lightweight objects with .numpy()
+        class _NumpyWrap:
+            def __init__(self, arr):
+                self._arr = arr
+
+            def numpy(self):
+                return self._arr
+
+        wrapped = [_NumpyWrap(a) for a in actual_outputs_np]
+
+        if expected_outputs and actual_outputs_np:
+            refs = {i: exp for i, exp in enumerate(expected_outputs)}
+            errors = AirRuntime.verify_results(
+                wrapped,
+                refs=refs,
                 rtol=rtol,
                 atol=atol,
                 max_mismatch_percentage=max_mismatch_percentage,
-                min_correlation=min_correlation,
-            ):
+            )
+            if errors == 0:
                 print("PASS!")
-                return_code = 0
+                return 0
             else:
                 print("failed.")
-                return_code = -1
-        elif stochastic_expected_outputs and actual_outputs:
-            if self._check_outputs_stochastic(
-                actual_outputs=actual_outputs,
-                stochastic_expected_outputs=stochastic_expected_outputs,
+                return -1
+        elif stochastic_expected_outputs and actual_outputs_np:
+            errors = AirRuntime.verify_results(
+                wrapped,
+                refs={},
                 rtol=rtol,
                 atol=atol,
                 max_mismatch_percentage=max_mismatch_percentage,
-            ):
+                stochastic_refs=stochastic_expected_outputs,
+            )
+            if errors == 0:
                 print("PASS!")
-                return_code = 0
+                return 0
             else:
                 print("failed.")
-                return_code = -1
-        elif self.trace_size > 0 and not (
-            expected_outputs or stochastic_expected_outputs
-        ):
-            # Trace-only case
-            print("Trace data extracted successfully!")
-            return_code = 0
+                return -1
         else:
-            print("No outputs to validate.")
-            return_code = 0
-
-        return return_code
-
-    def _check_outputs(
-        self,
-        actual_outputs: List[np.ndarray],
-        expected_outputs: List[np.ndarray],
-        rtol: float = 1e-3,
-        atol: float = 1e-8,
-        max_mismatch_percentage: float = 0,
-        min_correlation: float = None,
-    ):
-        assert len(actual_outputs) == len(
-            expected_outputs
-        ), f"Number of actual outputs ({len(actual_outputs)}) does not equal number of expected outputs ({len(expected_outputs)})"
-        np.set_printoptions(formatter={"int": hex})
-
-        for i, (actual, expected) in enumerate(zip(actual_outputs, expected_outputs)):
-            actual = np.reshape(actual, expected.shape)
-
-            if self.verbose:
-                print("Expected: ")
-                if len(expected.shape) == 2:
-                    print(np.asmatrix(expected))
-                else:
-                    print(expected)
-                print("Actual: ")
-                if len(actual.shape) == 2:
-                    print(np.asmatrix(actual))
-                else:
-                    print(actual)
-
-            if expected.dtype in [np.float16, np.float32, np.float64, bfloat16]:
-                if expected.dtype == bfloat16:
-                    expected = expected.astype(np.float64)
-                    actual = actual.astype(np.float64)
-
-                # Element-wise tolerance check
-                elementwise_ok = True
-                close_mask = np.isclose(actual, expected, rtol=rtol, atol=atol)
-                mismatch_indices = np.where(~close_mask)
-                num_mismatches = len(mismatch_indices[0])
-                total_elements = expected.size
-                max_acceptable = int(total_elements * max_mismatch_percentage / 100)
-                if num_mismatches > max_acceptable:
-                    elementwise_ok = False
-                    print(f"ERROR: Output {i} does not meet expected output.")
-                    print(f"Shape: {expected.shape}")
-                    if total_elements > 0:
-                        print(
-                            f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)"
-                        )
-                    else:
-                        print(
-                            f"Mismatches: {num_mismatches} / {total_elements} elements (empty array)"
-                        )
-                    if max_acceptable > 0:
-                        print(
-                            f"Max acceptable: {max_acceptable} ({max_mismatch_percentage}%)"
-                        )
-                    # Show first N mismatches
-                    max_display = 20
-                    print(
-                        f"First {min(max_display, num_mismatches)} mismatched locations:"
-                    )
-                    for j in range(min(max_display, num_mismatches)):
-                        idx = tuple(dim[j] for dim in mismatch_indices)
-                        print(
-                            f"  Index {idx}: expected={expected[idx]}, actual={actual[idx]}, diff={abs(actual[idx] - expected[idx])}"
-                        )
-                    if num_mismatches > max_display:
-                        print(
-                            f"  ... and {num_mismatches - max_display} more mismatches"
-                        )
-
-                # Correlation check (parallel with element-wise)
-                corr_ok = True
-                if min_correlation is not None and total_elements > 0:
-                    corr = float(
-                        np.corrcoef(actual.flatten(), expected.flatten())[0, 1]
-                    )
-                    print(
-                        f"Output {i} correlation: {corr:.6f} "
-                        f"(threshold: {min_correlation})"
-                    )
-                    if not np.isfinite(corr) or corr < min_correlation:
-                        corr_ok = False
-                        print(
-                            f"ERROR: Output {i} correlation {corr:.6f} "
-                            f"below threshold {min_correlation}"
-                        )
-
-                if not elementwise_ok or not corr_ok:
-                    return False
-            else:
-                if not np.array_equal(actual, expected):
-                    print(f"ERROR: Output {i} does not meet expected output.")
-                    # Find mismatched elements
-                    mismatch_mask = actual != expected
-                    mismatch_indices = np.where(mismatch_mask)
-                    num_mismatches = len(mismatch_indices[0])
-                    total_elements = expected.size
-                    print(f"Shape: {expected.shape}")
-                    if total_elements > 0:
-                        print(
-                            f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)"
-                        )
-                    else:
-                        print(
-                            f"Mismatches: {num_mismatches} / {total_elements} elements (empty array)"
-                        )
-                    # Show first N mismatches
-                    max_display = 20
-                    print(
-                        f"First {min(max_display, num_mismatches)} mismatched locations:"
-                    )
-                    for j in range(min(max_display, num_mismatches)):
-                        idx = tuple(dim[j] for dim in mismatch_indices)
-                        print(
-                            f"  Index {idx}: expected={expected[idx]}, actual={actual[idx]}"
-                        )
-                    if num_mismatches > max_display:
-                        print(
-                            f"  ... and {num_mismatches - max_display} more mismatches"
-                        )
-                    return False
-
-        return True
-
-    def _check_outputs_stochastic(
-        self,
-        actual_outputs: List[np.ndarray],
-        stochastic_expected_outputs: List[np.ndarray],
-        rtol: float = 1e-3,
-        atol: float = 1e-8,
-        max_mismatch_percentage: float = 0,
-    ):
-        assert len(actual_outputs) == len(
-            stochastic_expected_outputs
-        ), f"Number of actual outputs ({len(actual_outputs)}) does not equal number of expected outputs ({len(stochastic_expected_outputs)})"
-        np.set_printoptions(formatter={"int": hex})
-
-        for i, (actual, expected) in enumerate(
-            zip(actual_outputs, stochastic_expected_outputs)
-        ):
-            actual = np.reshape(actual, expected["shape"])
-
-            if self.verbose:
-                print("Expected: ")
-                if len(expected["shape"]) == 2:
-                    print(np.asmatrix(expected))
-                else:
-                    print("Shape: ", expected["shape"])
-                    print("Indices: ", expected["indices"])
-                    print("Values: ", expected["values"])
-                print("Actual: ")
-                if len(actual.shape) == 2:
-                    print(np.asmatrix(actual))
-                else:
-                    print(actual)
-
-            if expected["values"][0].dtype in [
-                np.float16,
-                np.float32,
-                np.float64,
-                bfloat16,
-            ]:
-                if expected["values"][0].dtype == bfloat16:
-                    expected["values"] = expected["values"].astype(np.float64)
-                    actual = actual.astype(np.float64)
-                actual_stochastic = actual[tuple(expected["indices"])]
-                close_mask = np.isclose(
-                    actual_stochastic, expected["values"], rtol=rtol, atol=atol
-                )
-                mismatch_positions = np.where(~close_mask)[0]
-                num_mismatches = len(mismatch_positions)
-                total_elements = len(expected["values"])
-                max_acceptable = int(total_elements * max_mismatch_percentage / 100)
-                if num_mismatches > max_acceptable:
-                    print(f"ERROR: Output {i} does not meet expected output.")
-                    print(f"Shape: {expected['shape']}")
-                    print(f"Stochastic check: {total_elements} sampled elements")
-                    print(
-                        f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)"
-                    )
-                    if max_acceptable > 0:
-                        print(
-                            f"Max acceptable: {max_acceptable} ({max_mismatch_percentage}%)"
-                        )
-                    # Show first N mismatches
-                    max_display = 20
-                    print(
-                        f"First {min(max_display, num_mismatches)} mismatched locations:"
-                    )
-                    for j in range(min(max_display, num_mismatches)):
-                        pos = mismatch_positions[j]
-                        idx = tuple(dim[pos] for dim in expected["indices"])
-                        exp_val = expected["values"][pos]
-                        act_val = actual_stochastic[pos]
-                        print(
-                            f"  Index {idx}: expected={exp_val}, actual={act_val}, diff={abs(act_val - exp_val)}"
-                        )
-                    if num_mismatches > max_display:
-                        print(
-                            f"  ... and {num_mismatches - max_display} more mismatches"
-                        )
-                    return False
-            else:
-                actual_stochastic = actual[tuple(expected["indices"])]
-                if not np.array_equal(actual_stochastic, expected["values"]):
-                    print(f"ERROR: Output {i} does not meet expected output.")
-                    # Find mismatched elements
-                    mismatch_mask = actual_stochastic != expected["values"]
-                    mismatch_positions = np.where(mismatch_mask)[0]
-                    num_mismatches = len(mismatch_positions)
-                    total_elements = len(expected["values"])
-                    print(f"Shape: {expected['shape']}")
-                    print(f"Stochastic check: {total_elements} sampled elements")
-                    if total_elements > 0:
-                        print(
-                            f"Mismatches: {num_mismatches} / {total_elements} elements ({100*num_mismatches/total_elements:.2f}%)"
-                        )
-                    else:
-                        print(
-                            f"Mismatches: {num_mismatches} / {total_elements} elements (empty array)"
-                        )
-                    # Show first N mismatches
-                    max_display = 20
-                    print(
-                        f"First {min(max_display, num_mismatches)} mismatched locations:"
-                    )
-                    for j in range(min(max_display, num_mismatches)):
-                        pos = mismatch_positions[j]
-                        idx = tuple(dim[pos] for dim in expected["indices"])
-                        exp_val = expected["values"][pos]
-                        act_val = actual_stochastic[pos]
-                        print(f"  Index {idx}: expected={exp_val}, actual={act_val}")
-                    if num_mismatches > max_display:
-                        print(
-                            f"  ... and {num_mismatches - max_display} more mismatches"
-                        )
-                    return False
-
-        return True
+            print("Trace data extracted successfully!")
+            return 0
diff --git a/python/air/dialects/_air_ops_ext.py b/python/air/dialects/_air_ops_ext.py
index 0673a7bda..c6c0dedf7 100644
--- a/python/air/dialects/_air_ops_ext.py
+++ b/python/air/dialects/_air_ops_ext.py
@@ -15,6 +15,8 @@
 
 from ..extras import types as T
 from .func import FuncOp, CallOp
+from ._air_enum_gen import MemorySpace as _MemorySpace
+from .affine import apply as _affine_apply
 
 
 def pyint_to_index(i):
@@ -320,6 +322,62 @@ def module_builder_wrapper(*args, **kwargs):
 segment = region_op(Segment, terminator=lambda *_args: SegmentTerminatorOp())
 
 
+def l1_memref_type(shape, element_type):
+    """Build a MemRefType of the given shape and element type in L1 memory."""
+    # L1 is the per-core scratchpad; the memory space is attached as an i32
+    # IntegerAttr built from the MemorySpace.L1 enum value.
+    l1_space = IntegerAttr.get(T.i32(), _MemorySpace.L1)
+    return MemRefType.get(shape, element_type, memory_space=l1_space)
+
+
+def l2_memref_type(shape, element_type):
+    """Build a MemRefType of the given shape and element type in L2 memory."""
+    # L2 is the segment-shared level; the memory space is attached as an i32
+    # IntegerAttr built from the MemorySpace.L2 enum value.
+    l2_space = IntegerAttr.get(T.i32(), _MemorySpace.L2)
+    return MemRefType.get(shape, element_type, memory_space=l2_space)
+
+
+def vec_type(size, element_type):
+    """Create a 1D VectorType with `size` elements of type `element_type`."""
+    return VectorType.get([size], element_type)
+
+
+def identity_map_attr():
+    """Return the 1D identity map as an AffineMapAttr (for transfer_read/write)."""
+    return AffineMapAttr.get(AffineMap.get_identity(1))
+
+
+def tile_offset_1d(loop_var, tile_idx, tile_n):
+    """
+    Return the start index of a strided tile: loop_var + tile_idx * tile_n.
+
+    The value is produced by applying a two-symbol affine map
+    (s0 + s1 * tile_n) to [loop_var, tile_idx], so callers do not have to
+    spell out the AffineMap / AffineExpr / affine_apply boilerplate themselves.
+
+    Args:
+        loop_var:  SSA Value bound to s0 (outer loop induction variable).
+        tile_idx:  SSA Value bound to s1 (herd tile index, e.g. _ty).
+        tile_n:    Python int tile size, embedded as an affine constant.
+    Returns:
+        SSA Value holding the computed index.
+    """
+    base_sym = AffineSymbolExpr.get(0)
+    tile_sym = AffineSymbolExpr.get(1)
+    stride = AffineConstantExpr.get(tile_n)
+
+    # s0 + s1 * tile_n, assembled innermost-first.
+    scaled_idx = AffineExpr.get_mul(tile_sym, stride)
+    offset_expr = AffineExpr.get_add(base_sym, scaled_idx)
+
+    # The map has 0 dimensions and 2 symbols, so both operands are passed
+    # as symbol operands in s0, s1 order.
+    offset_map = AffineMap.get(0, 2, [offset_expr])
+
+    return _affine_apply(offset_map, [loop_var, tile_idx])
+
+
 def external_func(name, inputs, outputs=None, visibility="private"):
     if outputs is None:
         outputs = []
diff --git a/test/xrt/01_air_to_npu/gen.py b/test/xrt/01_air_to_npu/gen.py
index 5ba44c0a9..87814e76d 100644
--- a/test/xrt/01_air_to_npu/gen.py
+++ b/test/xrt/01_air_to_npu/gen.py
@@ -9,11 +9,12 @@
 from air.compiler.util import run_transform
 import argparse
 
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.ir import *
 import air.passmanager
 
+import aie.utils
+import filelock
 import numpy as np
 
 np.random.seed(42)
@@ -156,19 +157,18 @@ def forward(lhs, rhs):
 # Matrix B: (256, 128)
 B = np.random.randint(-10, 10, size=(256, 128), dtype=np.int32)
 C = np.matmul(A, B)
-runner = XRTRunner(
+npu_kernel = compile_air(
+    air_module,
     air_loop_fusion=True,
     omit_while_true_loop=False,
     use_lock_race_condition_fix=True,
     trace_offset=opts.trace_offset,
     trace_size=opts.trace_size,
-    trace_file=opts.trace_file,
     runtime_loop_tiling_sizes=[4, 4],
 )
-exit(
-    runner.run_test(
-        air_module,
-        inputs=[A, B],
-        expected_outputs=[C],
-    )
-)
+runtime = get_air_runtime()
+# Slots 0-1 are the inputs A and B; slot 2 is the zero-filled output buffer.
+out_buf = np.zeros(C.shape, C.dtype)
+io_args = [aie.utils.tensor(arr) for arr in (A, B, out_buf)]
+refs = {2: C}
+exit(runtime.run_test(npu_kernel, io_args, refs=refs, trace_file=opts.trace_file))
diff --git a/test/xrt/02_mul_shim_1x1/run.py b/test/xrt/02_mul_shim_1x1/run.py
index 404d6f92c..017311848 100644
--- a/test/xrt/02_mul_shim_1x1/run.py
+++ b/test/xrt/02_mul_shim_1x1/run.py
@@ -3,7 +3,7 @@
 
 # RUN: %PYTHON %s | FileCheck %s
 
-import air.backend.xrt as xrt_backend
+from air.backend.xrt import compile_air, get_air_runtime
 from air.dialects.air import *
 from air.dialects.func import FuncOp
 import air.dialects.linalg.opdsl.lang as linalg_lang
@@ -12,6 +12,7 @@
 from air.ir import *
 
 import argparse
+import aie.utils
 import numpy as np
 
 np.random.seed(42)
@@ -113,18 +114,23 @@ def run_test(size, idtype, odtype):
     ref = (input_a * input_b).astype(odtype)
     input_c = np.ones_like(ref)
 
-    backend = xrt_backend.XRTBackend(
+    npu_kernel = compile_air(
+        mlir_module,
         verbose=verbose,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
 
-    # run the module
-    compiled_module = backend.compile(mlir_module)
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(input_b),
+        aie.utils.tensor(input_c),
+    ]
     with filelock.FileLock("/tmp/npu.lock"):
-        mul = backend.load(compiled_module)
-        _, _, output_c = mul(input_a, input_b, input_c)
-        backend.unload()
+        handle = runtime.load(npu_kernel)
+        runtime.run(handle, io_args)
+    output_c = io_args[2].numpy()
 
     print("inputA:", input_a)
     print("inputB:", input_b)
diff --git a/test/xrt/03_mul_L1L2_1x1/run.py b/test/xrt/03_mul_L1L2_1x1/run.py
index 57e3ca46b..5f4d511f4 100644
--- a/test/xrt/03_mul_L1L2_1x1/run.py
+++ b/test/xrt/03_mul_L1L2_1x1/run.py
@@ -3,7 +3,7 @@
 
 # RUN: %PYTHON %s | FileCheck %s
 
-import air.backend.xrt as xrt_backend
+from air.backend.xrt import compile_air, get_air_runtime
 from air.dialects.air import *
 from air.dialects.func import FuncOp
 import air.dialects.linalg.opdsl.lang as linalg_lang
@@ -12,6 +12,7 @@
 from air.ir import *
 
 import argparse
+import aie.utils
 import numpy as np
 
 np.random.seed(42)
@@ -151,7 +152,8 @@ def run_test(size, idtype, odtype):
     ref = (input_a * input_b).astype(odtype)
     input_c = np.ones_like(ref)
 
-    backend = xrt_backend.XRTBackend(
+    npu_kernel = compile_air(
+        mlir_module,
         verbose=verbose,
         use_lock_race_condition_fix=True,
         output_format=args.output_format,
@@ -159,12 +161,16 @@ def run_test(size, idtype, odtype):
         runtime_loop_tiling_sizes=[4, 4],
     )
 
-    # run the module
-    compiled_module = backend.compile(mlir_module)
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(input_b),
+        aie.utils.tensor(input_c),
+    ]
     with filelock.FileLock("/tmp/npu.lock"):
-        mul = backend.load(compiled_module)
-        _, _, output_c = mul(input_a, input_b, input_c)
-        backend.unload()
+        handle = runtime.load(npu_kernel)
+        runtime.run(handle, io_args)
+    output_c = io_args[2].numpy()
 
     print("inputA:", input_a)
     print("inputB:", input_b)
diff --git a/test/xrt/04_gemm_w_pack/gen.py b/test/xrt/04_gemm_w_pack/gen.py
index 277574d2c..96b6700f9 100644
--- a/test/xrt/04_gemm_w_pack/gen.py
+++ b/test/xrt/04_gemm_w_pack/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -105,8 +105,8 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/06_add_shim_bf16/gen.py b/test/xrt/06_add_shim_bf16/gen.py
index c5979c3f8..7d6c6dafc 100644
--- a/test/xrt/06_add_shim_bf16/gen.py
+++ b/test/xrt/06_add_shim_bf16/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 from air.passmanager import *
 from air.dialects.air import module_builder
@@ -95,9 +95,9 @@ def mul(lhs, rhs, out):
 # Run compile and load
 ###############################################
 
-backend = XRTBackend(
+compile_air(
+    module,
     air_loop_fusion=True,
     use_lock_race_condition_fix=True,
     runtime_loop_tiling_sizes=[4, 4],
 )
-module_function = backend.compile(module)
diff --git a/test/xrt/07_extern_linalg/gen.py b/test/xrt/07_extern_linalg/gen.py
index 843c45259..5c7233965 100644
--- a/test/xrt/07_extern_linalg/gen.py
+++ b/test/xrt/07_extern_linalg/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 from air.passmanager import *
 from air.dialects.air import module_builder
@@ -100,10 +100,10 @@ def add(lhs, rhs, out):
 # Run compile and load
 ###############################################
 
-backend = XRTBackend(
+compile_air(
+    module,
     lower_linalg_to_func="kernel.o",
     omit_pingpong=True,
     use_lock_race_condition_fix=True,
     runtime_loop_tiling_sizes=[4, 4],
 )
-module_function = backend.compile(module)
diff --git a/test/xrt/08_gemm_extern_vec/gen.py b/test/xrt/08_gemm_extern_vec/gen.py
index b7221169f..fbbdcd7df 100644
--- a/test/xrt/08_gemm_extern_vec/gen.py
+++ b/test/xrt/08_gemm_extern_vec/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -123,9 +123,9 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         air_loop_fusion=True,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/09_gemm_extern_vec_4x4/gen.py b/test/xrt/09_gemm_extern_vec_4x4/gen.py
index 846c8901f..4ab683cd6 100644
--- a/test/xrt/09_gemm_extern_vec_4x4/gen.py
+++ b/test/xrt/09_gemm_extern_vec_4x4/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -124,9 +124,9 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         air_loop_fusion=True,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/10_gemm_peeling_extern_vec/gen.py b/test/xrt/10_gemm_peeling_extern_vec/gen.py
index efe11e766..a5b99786f 100644
--- a/test/xrt/10_gemm_peeling_extern_vec/gen.py
+++ b/test/xrt/10_gemm_peeling_extern_vec/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -136,8 +136,8 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/11_gemm_bias_fusion/gen.py b/test/xrt/11_gemm_bias_fusion/gen.py
index 880346c49..1bddee696 100644
--- a/test/xrt/11_gemm_bias_fusion/gen.py
+++ b/test/xrt/11_gemm_bias_fusion/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -187,10 +187,12 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         air_loop_fusion=True,
         omit_auto_broadcast=True,
-        channel_multiplexing=["L1",],
+        channel_multiplexing=[
+            "L1",
+        ],
         runtime_loop_tiling_sizes=[2, 2],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/12_matmul_transform_1x4_bf16/gen.py b/test/xrt/12_matmul_transform_1x4_bf16/gen.py
index 45119bdb7..1163ef67d 100644
--- a/test/xrt/12_matmul_transform_1x4_bf16/gen.py
+++ b/test/xrt/12_matmul_transform_1x4_bf16/gen.py
@@ -9,7 +9,7 @@
 import air.passmanager
 from air._mlir_libs._air import run_transform
 from air.dialects.air import module_builder
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 
 import argparse
 
@@ -132,10 +132,10 @@ def forward(lhs, rhs, out):
 # Run compile and load
 ###############################################
 
-backend = XRTBackend(
+compile_air(
+    air_module,
     air_loop_fusion=True,
     runtime_loop_tiling_sizes=[1, 1],
     lower_linalg_to_func="kernel.o",
     use_lock_race_condition_fix=True,
 )
-backend.compile(air_module)
diff --git a/test/xrt/13_conv2d_i32/gen.py b/test/xrt/13_conv2d_i32/gen.py
index 7ba7bf0a6..8cfa053a8 100644
--- a/test/xrt/13_conv2d_i32/gen.py
+++ b/test/xrt/13_conv2d_i32/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -100,8 +100,8 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         runtime_loop_tiling_sizes=[1, 1],
         use_lock_race_condition_fix=True,
     )
-    backend.compile(air_module)
diff --git a/test/xrt/14_conv2d_i8_extern_vec/gen.py b/test/xrt/14_conv2d_i8_extern_vec/gen.py
index eee6004af..8eacf5ec6 100644
--- a/test/xrt/14_conv2d_i8_extern_vec/gen.py
+++ b/test/xrt/14_conv2d_i8_extern_vec/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 import argparse
@@ -116,11 +116,11 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         lower_linalg_to_func="conv.o",
         trace_offset=opts.trace_offset,
         trace_size=opts.trace_size,
         runtime_loop_tiling_sizes=[1, 1],
         use_lock_race_condition_fix=True,
     )
-    backend.compile(air_module)
diff --git a/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py b/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py
index 871d846ca..3c80645ee 100644
--- a/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py
+++ b/test/xrt/15_gemm_peeling_extern_vec_4x4_bf16/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -159,9 +159,9 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         lower_linalg_to_func="mm.o",
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py b/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py
index 54bd81322..eaab080e4 100644
--- a/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py
+++ b/test/xrt/16_gemm_peeling_extern_vec_4x4_bf16_packet/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -159,9 +159,9 @@
 # Run compile and load
 ###############################################
 
-backend = XRTBackend(
+compile_air(
+    air_module,
     lower_linalg_to_func="mm.o",
     use_lock_race_condition_fix=True,
     runtime_loop_tiling_sizes=[4, 4],
 )
-backend.compile(air_module)
diff --git a/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py b/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py
index f3157db7e..9082b6b03 100644
--- a/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py
+++ b/test/xrt/17_gemm_8x16_transform_vec_4x4/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -135,10 +135,10 @@
 # Run compile and load
 ###############################################
 
-backend = XRTBackend(
+compile_air(
+    air_module,
     air_loop_fusion=True,
     lower_linalg_to_func="mm.o",
     runtime_loop_tiling_sizes=[1, 1],
     use_lock_race_condition_fix=True,
 )
-backend.compile(air_module)
diff --git a/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py b/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py
index 16b7a6e51..09d22e61d 100644
--- a/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py
+++ b/test/xrt/18_matmul_8x16_shim_transform_bf16/gen.py
@@ -1,11 +1,10 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
-
 ################################################
 ## Input SCF and Linalg IR
 ################################################
@@ -111,9 +110,9 @@
 # Run compile and load
 ###############################################
 
-backend = XRTBackend(
+compile_air(
+    air_module,
     air_loop_fusion=True,
     lower_linalg_to_func="kernel.o",
     runtime_loop_tiling_sizes=[2, 2],
 )
-backend.compile(air_module)
diff --git a/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py b/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py
index 790927e04..b1205fd6d 100644
--- a/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py
+++ b/test/xrt/19_matmul_8x16_core_transform_bf16/gen.py
@@ -1,11 +1,10 @@
 # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
-
 ################################################
 ## Input SCF and Linalg IR
 ################################################
@@ -109,9 +108,9 @@
 # Run compile and load
 ###############################################
 
-backend = XRTBackend(
+compile_air(
+    air_module,
     air_loop_fusion=True,
     lower_linalg_to_func="kernel.o",
     runtime_loop_tiling_sizes=[4, 4],
 )
-backend.compile(air_module)
diff --git a/test/xrt/20_batch_matmul_i32/gen.py b/test/xrt/20_batch_matmul_i32/gen.py
index 566c53e3a..01394299c 100644
--- a/test/xrt/20_batch_matmul_i32/gen.py
+++ b/test/xrt/20_batch_matmul_i32/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -159,8 +159,8 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         air_loop_fusion=True,
         runtime_loop_tiling_sizes=[8, 1],  # Note: [4, 4] gives numeric error. Why?
     )
-    backend.compile(air_module)
diff --git a/test/xrt/21_conv2d_depthwise_i32/gen.py b/test/xrt/21_conv2d_depthwise_i32/gen.py
index 0b0b178db..91f9b7b99 100644
--- a/test/xrt/21_conv2d_depthwise_i32/gen.py
+++ b/test/xrt/21_conv2d_depthwise_i32/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -96,8 +96,8 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         air_loop_fusion=True,
         runtime_loop_tiling_sizes=[2, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/22_conv2d_stride2_i32/gen.py b/test/xrt/22_conv2d_stride2_i32/gen.py
index 4dc4deab7..c4ec60091 100644
--- a/test/xrt/22_conv2d_stride2_i32/gen.py
+++ b/test/xrt/22_conv2d_stride2_i32/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -101,8 +101,8 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         runtime_loop_tiling_sizes=[1, 1],
         use_lock_race_condition_fix=True,
     )
-    backend.compile(air_module)
diff --git a/test/xrt/25_batch_matmul_bf16/gen.py b/test/xrt/25_batch_matmul_bf16/gen.py
index a6032fcdd..ed60bd207 100644
--- a/test/xrt/25_batch_matmul_bf16/gen.py
+++ b/test/xrt/25_batch_matmul_bf16/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -159,9 +159,9 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         lower_linalg_to_func="mm.o",
         air_loop_fusion=True,
         runtime_loop_tiling_sizes=[1, 1],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/26_vecmat_i8/gen.py b/test/xrt/26_vecmat_i8/gen.py
index 0b1d3514b..ef0951a9f 100644
--- a/test/xrt/26_vecmat_i8/gen.py
+++ b/test/xrt/26_vecmat_i8/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -120,9 +120,9 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         lower_linalg_to_func="vm.o",
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py b/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py
index 582e9066e..bd515b928 100644
--- a/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py
+++ b/test/xrt/27_gemm_peeling_extern_vec_4x4_i32/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -160,8 +160,8 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         runtime_loop_tiling_sizes=[2, 2],
         use_lock_race_condition_fix=True,
     )
-    backend.compile(air_module)
diff --git a/test/xrt/28_gemm_loop_nest_bf16/gen.py b/test/xrt/28_gemm_loop_nest_bf16/gen.py
index 065e4c52c..461803ee9 100644
--- a/test/xrt/28_gemm_loop_nest_bf16/gen.py
+++ b/test/xrt/28_gemm_loop_nest_bf16/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -121,10 +121,10 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         omit_pingpong=True,
         lower_linalg_to_func="mm.o",
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py b/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py
index f95f2c5d5..c22a2ce7b 100644
--- a/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py
+++ b/test/xrt/29_gemm_4_level_tiling_extern_vec_4x4_bf16/gen.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air
 from air.ir import *
 import air.passmanager
 
@@ -130,9 +130,9 @@
     # Run compile and load
     ###############################################
 
-    backend = XRTBackend(
+    compile_air(
+        air_module,
         omit_pingpong=True,
         lower_linalg_to_func="mm.o",
         runtime_loop_tiling_sizes=[4, 4],
     )
-    backend.compile(air_module)
diff --git a/test/xrt/30_mul_rtp_1x1/run.py b/test/xrt/30_mul_rtp_1x1/run.py
index 4356df1d7..bb55c8425 100644
--- a/test/xrt/30_mul_rtp_1x1/run.py
+++ b/test/xrt/30_mul_rtp_1x1/run.py
@@ -3,7 +3,7 @@
 
 # RUN: %PYTHON %s | FileCheck %s
 
-import air.backend.xrt as xrt_backend
+from air.backend.xrt import compile_air, get_air_runtime
 from air.dialects.air import *
 from air.dialects.func import FuncOp, ReturnOp
 import air.dialects.linalg.opdsl.lang as linalg_lang
@@ -12,6 +12,7 @@
 from air.ir import *
 
 import argparse
+import aie.utils
 import numpy as np
 
 np.random.seed(42)
@@ -115,20 +116,25 @@ def run_test(size, idtype, odtype):
     ref = (input_a * input_b).astype(odtype)
     input_c = np.ones_like(ref)
 
-    backend = xrt_backend.XRTBackend(
+    npu_kernel = compile_air(
+        mlir_module,
         omit_pingpong=True,
         verbose=verbose,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
 
-    # run the module
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(input_b),
+        aie.utils.tensor(input_c),
+    ]
     with filelock.FileLock("/tmp/npu.lock"):
-        mul = backend.compile_and_load(mlir_module)
+        handle = runtime.load(npu_kernel)
         print("running")
-        _, _, output_c = mul(input_a, input_b, input_c)
-
-    backend.unload()
+        runtime.run(handle, io_args)
+    output_c = io_args[2].numpy()
 
     print("inputA:", input_a)
     print("inputB:", input_b)
diff --git a/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py b/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py
index ff91dad5b..3eb462264 100644
--- a/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py
+++ b/test/xrt/31_triton_blk_ptr_eltwise_mul/run.py
@@ -4,11 +4,10 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -104,19 +103,20 @@
     inputs_b = (np.random.rand(*input_size)).reshape(input_size).astype(input_type)
     ref = (inputs_a * inputs_b).astype(input_type)
 
-    ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         use_lock_race_condition_fix=True,
         output_format=args.output_format,
         instance_name="kernel",
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[inputs_a, inputs_b],
-            expected_outputs=[ref],
-            rtol=1e-3,
-        )
-    )
+
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(inputs_a),
+        aie.utils.tensor(inputs_b),
+        aie.utils.tensor(np.zeros(ref.shape, ref.dtype)),
+    ]
+    refs = {2: ref}
+    exit(runtime.run_test(npu_kernel, io_args, refs=refs, rtol=1e-3))
diff --git a/test/xrt/32_triton_matmul/run.py b/test/xrt/32_triton_matmul/run.py
index 15578d5ea..7f1f3e1b1 100644
--- a/test/xrt/32_triton_matmul/run.py
+++ b/test/xrt/32_triton_matmul/run.py
@@ -3,12 +3,11 @@
 # Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -121,17 +120,18 @@
     B = np.random.rand(K, N).astype(input_type)  # Shape [K, N]
     C = np.matmul(A, B).astype(input_type)  # Shape [M, N]
 
-    ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-3,
-        )
-    )
+
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    refs = {2: C}
+    exit(runtime.run_test(npu_kernel, io_args, refs=refs, rtol=1e-3))
diff --git a/test/xrt/33_triton_matmul_ver2/run.py b/test/xrt/33_triton_matmul_ver2/run.py
index b859be202..dc54545cc 100644
--- a/test/xrt/33_triton_matmul_ver2/run.py
+++ b/test/xrt/33_triton_matmul_ver2/run.py
@@ -4,12 +4,11 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -106,19 +105,20 @@
     B = np.random.rand(K, N).astype(input_type)  # Shape [K, N]
     C = np.matmul(A, B).astype(input_type)  # Shape [M, N]
 
-    ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
         output_format=args.output_format,
         instance_name="bare_matmul",
     )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-3,
-        )
-    )
+
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    refs = {2: C}
+    exit(runtime.run_test(npu_kernel, io_args, refs=refs, rtol=1e-3))
diff --git a/test/xrt/34_cascade_vecadd/run_chess.py b/test/xrt/34_cascade_vecadd/run_chess.py
index b5429a096..610f96093 100644
--- a/test/xrt/34_cascade_vecadd/run_chess.py
+++ b/test/xrt/34_cascade_vecadd/run_chess.py
@@ -13,8 +13,8 @@
 import argparse
 import sys
 
-from air.backend.xrt_runner import XRTRunner
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air, get_air_runtime
+import aie.utils
 
 import numpy as np
 
@@ -97,44 +97,40 @@
     ###############################################
 
     input_a = np.arange(0, 2048, dtype=np.int32)
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)])
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] + 4 for i in zip(*sampled_indices)],
-            dtype=np.int32,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (2048),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
 
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    npu_kernel = compile_air(
+        air_module,
+        verbose=args.verbose,
+        omit_while_true_loop=False,
+        runtime_loop_tiling_sizes=[4, 4],
+    )
+
+    if args.compile_mode == "compile-only":
+        exit(0)
+
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)])
+
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [input_a[i] + 4 for i in zip(*sampled_indices)],
+        dtype=np.int32,
+    )
+
+    # Bundle the sampled reference; "shape" is a bare int, wrapped in a tuple below
+    sampled_data = {
+        "shape": (2048),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    runtime = get_air_runtime()
+    dtype = sampled_data["values"].dtype
+    shape = sampled_data["shape"]
+    if isinstance(shape, int):
+        shape = (shape,)
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(np.zeros(shape, dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data]))
diff --git a/test/xrt/34_cascade_vecadd/run_peano.py b/test/xrt/34_cascade_vecadd/run_peano.py
index 3c28274a1..cb015b2df 100644
--- a/test/xrt/34_cascade_vecadd/run_peano.py
+++ b/test/xrt/34_cascade_vecadd/run_peano.py
@@ -13,8 +13,8 @@
 import argparse
 import sys
 
-from air.backend.xrt_runner import XRTRunner
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air, get_air_runtime
+import aie.utils
 
 import numpy as np
 
@@ -100,44 +100,40 @@
     ###############################################
 
     input_a = np.arange(0, 2048, dtype=np.int32)
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)])
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] + 4 for i in zip(*sampled_indices)],
-            dtype=np.int32,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (2048),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
 
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    npu_kernel = compile_air(
+        air_module,
+        verbose=args.verbose,
+        omit_while_true_loop=False,
+        runtime_loop_tiling_sizes=[4, 4],
+    )
+
+    if args.compile_mode == "compile-only":
+        exit(0)
+
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)])
+
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [input_a[i] + 4 for i in zip(*sampled_indices)],
+        dtype=np.int32,
+    )
+
+    # Bundle the sampled reference; "shape" is a bare int, wrapped in a tuple below
+    sampled_data = {
+        "shape": (2048),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    runtime = get_air_runtime()
+    dtype = sampled_data["values"].dtype
+    shape = sampled_data["shape"]
+    if isinstance(shape, int):
+        shape = (shape,)
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(np.zeros(shape, dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data]))
diff --git a/test/xrt/35_herd_reduce/run.py b/test/xrt/35_herd_reduce/run.py
index 4e76bd7ff..e7348a073 100644
--- a/test/xrt/35_herd_reduce/run.py
+++ b/test/xrt/35_herd_reduce/run.py
@@ -13,8 +13,8 @@
 import argparse
 import sys
 
-from air.backend.xrt_runner import XRTRunner
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air, get_air_runtime
+import aie.utils
 
 import numpy as np
 
@@ -106,48 +106,42 @@
     ###############################################
 
     input_a = np.arange(0, 2048, dtype=np.int32)
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)])
-
-        # Compute reference results for sampled indices
-        sampled_values = np.array(
-            [input_a[i] + 4 for i in zip(*sampled_indices)],
-            dtype=np.int32,
-        )
-
-        # Store as a dictionary
-        sampled_data = {
-            "shape": (2048),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
 
-        ###### Compile and test
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="scf1",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[input_a],
-                stochastic_expected_outputs=[sampled_data],
-            )
-        )
-
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="scf1",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(mlir_module)
-
-        backend.unload()
+    npu_kernel = compile_air(
+        air_module,
+        verbose=args.verbose,
+        omit_while_true_loop=False,
+        output_format=args.output_format,
+        instance_name="scf1",
+        runtime_loop_tiling_sizes=[4, 4],
+    )
+
+    if args.compile_mode == "compile-only":
+        exit(0)
+
+    num_samples = 100
+    sampled_indices = np.vstack([np.random.randint(0, 2048, num_samples)])
+
+    # Compute reference results for sampled indices
+    sampled_values = np.array(
+        [input_a[i] + 4 for i in zip(*sampled_indices)],
+        dtype=np.int32,
+    )
+
+    # Bundle the sampled reference; "shape" is a bare int, wrapped in a tuple below
+    sampled_data = {
+        "shape": (2048),
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    runtime = get_air_runtime()
+    dtype = sampled_data["values"].dtype
+    shape = sampled_data["shape"]
+    if isinstance(shape, int):
+        shape = (shape,)
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(np.zeros(shape, dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data]))
diff --git a/test/xrt/36_cascade_vecmat_i32/run.py b/test/xrt/36_cascade_vecmat_i32/run.py
index b46648f69..5731eb393 100644
--- a/test/xrt/36_cascade_vecmat_i32/run.py
+++ b/test/xrt/36_cascade_vecmat_i32/run.py
@@ -13,8 +13,12 @@
 import argparse
 import sys
 
-from air.backend.xrt_runner import XRTRunner
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air, get_air_runtime
+import aie.utils
+
+import numpy as np
+
+np.random.seed(42)
 
 parser = argparse.ArgumentParser(
     prog="run.py",
@@ -111,8 +115,8 @@
                   linalg.vecmat ins(%alloc_17, %alloc_18 : memref<32xi32, 2 : i32>, memref<32x32xi32, 2 : i32>) outs(%subview_16 : memref<32xi32, strided<[1]>, 2 : i32>)
                   memref.dealloc %alloc_17 : memref<32xi32, 2 : i32>
                   memref.dealloc %alloc_18 : memref<32x32xi32, 2 : i32>
-                }                
-                
+                }
+
                 scf.reduce(%alloc_12 : memref<32xi32, 2 : i32>) {
                 ^bb0(%a4: memref<32xi32, 2 : i32>, %a5: memref<32xi32, 2 : i32>):
                   linalg.add ins(%a4, %a5 : memref<32xi32, 2 : i32>, memref<32xi32, 2 : i32>) outs(%a4 : memref<32xi32, 2 : i32>)
@@ -166,28 +170,23 @@
         K,
     )
     input_b = np.arange(0, K * N, dtype=np.int32).reshape(K, N)
-    if args.compile_mode == "compile-and-run":
-        output_c = np.dot(input_a.astype(np.int32), input_b.astype(np.int32))
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[input_a, input_b],
-                expected_outputs=[output_c],
-            )
-        )
 
-    elif args.compile_mode == "compile-only":
-        ###### Compile only
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(air_module)
+    npu_kernel = compile_air(
+        air_module,
+        verbose=args.verbose,
+        omit_while_true_loop=False,
+        runtime_loop_tiling_sizes=[4, 4],
+    )
+
+    if args.compile_mode == "compile-only":
+        exit(0)
 
-        backend.unload()
+    output_c = np.dot(input_a.astype(np.int32), input_b.astype(np.int32))
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(input_b),
+        aie.utils.tensor(np.zeros(output_c.shape, output_c.dtype)),
+    ]
+    refs = {2: output_c}
+    exit(runtime.run_test(npu_kernel, io_args, refs=refs))
diff --git a/test/xrt/37_matmul_transform_4x4_bf16/run.py b/test/xrt/37_matmul_transform_4x4_bf16/run.py
index d950a6367..233ec64a9 100644
--- a/test/xrt/37_matmul_transform_4x4_bf16/run.py
+++ b/test/xrt/37_matmul_transform_4x4_bf16/run.py
@@ -9,12 +9,10 @@
 from air.compiler.util import run_transform
 import argparse
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air, get_air_runtime
 from air.ir import *
 import air.passmanager
-
-from air.backend.xrt_runner import XRTRunner
-from air.backend.xrt import XRTBackend
+import aie.utils
 
 from ml_dtypes import bfloat16
 import numpy as np
@@ -165,59 +163,59 @@ def forward(lhs, rhs):
 N = args.N
 input_a = np.arange(0, M * K, dtype=bfloat16).reshape(M, K)
 input_b = np.arange(0, K * N, dtype=bfloat16).reshape(K, N)
-if args.compile_mode == "compile-and-run":
-    # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-    num_samples = 100
-    sampled_indices = np.vstack(
-        [
-            np.random.randint(0, args.M, num_samples),  # i indices
-            np.random.randint(0, args.N, num_samples),  # j indices
-        ]
-    )
 
-    # Compute reference results for sampled indices
-    sampled_values = np.array(
-        [
-            np.sum(
-                (input_a[i, :].astype(np.float32) * input_b[:, j].astype(np.float32)),
-                dtype=np.float32,
-            )
-            for i, j in zip(*sampled_indices)
-        ],
-        dtype=np.float32,
-    )
+npu_kernel = compile_air(
+    air_module,
+    verbose=args.verbose,
+    omit_while_true_loop=False,
+    runtime_loop_tiling_sizes=[4, 4],
+    output_format=args.output_format,
+    instance_name="forward",
+)
 
-    # Store as a dictionary
-    sampled_data = {
-        "shape": (args.M, args.N),
-        "indices": sampled_indices,
-        "values": sampled_values,
-    }
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        runtime_loop_tiling_sizes=[4, 4],
-        output_format=args.output_format,
-        instance_name="forward",
-    )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[input_a, input_b],
-            stochastic_expected_outputs=[sampled_data],
-            rtol=1e-1,
+if args.compile_mode == "compile-only":
+    exit(0)
+
+# Stochastically sample num_samples results, and pass them to the runtime backend for verification.
+num_samples = 100
+sampled_indices = np.vstack(
+    [
+        np.random.randint(0, args.M, num_samples),  # i indices
+        np.random.randint(0, args.N, num_samples),  # j indices
+    ]
+)
+
+# Compute reference results for sampled indices
+sampled_values = np.array(
+    [
+        np.sum(
+            (input_a[i, :].astype(np.float32) * input_b[:, j].astype(np.float32)),
+            dtype=np.float32,
         )
-    )
+        for i, j in zip(*sampled_indices)
+    ],
+    dtype=np.float32,
+)
 
-elif args.compile_mode == "compile-only":
-    ###### Compile only
-    backend = XRTBackend(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        output_format=args.output_format,
-        instance_name="forward",
-        runtime_loop_tiling_sizes=[4, 4],
+# Store as a dictionary
+sampled_data = {
+    "shape": (args.M, args.N),
+    "indices": sampled_indices,
+    "values": sampled_values,
+}
+
+runtime = get_air_runtime()
+dtype = sampled_data["values"].dtype
+shape = sampled_data["shape"]
+if isinstance(shape, int):
+    shape = (shape,)
+io_args = [
+    aie.utils.tensor(input_a),
+    aie.utils.tensor(input_b),
+    aie.utils.tensor(np.zeros(shape, dtype)),
+]
+exit(
+    runtime.run_test(
+        npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data], rtol=1e-1
     )
-    module_function = backend.compile(air_module)
-
-    backend.unload()
+)
diff --git a/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py b/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py
index 48942b6f0..b416370f8 100644
--- a/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py
+++ b/test/xrt/38_cascade_vecmat_transform_2x4_i32/run.py
@@ -9,12 +9,10 @@
 from air.compiler.util import run_transform
 import argparse
 
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air, get_air_runtime
 from air.ir import *
 import air.passmanager
-
-from air.backend.xrt_runner import XRTRunner
-from air.backend.xrt import XRTBackend
+import aie.utils
 
 import numpy as np
 
@@ -161,55 +159,54 @@ def forward(lhs, rhs):
 max_val = 1024  # Conservative estimate
 input_a = np.random.randint(1, max_val + 1, size=(1, K), dtype=np.int32)
 input_b = np.random.randint(1, max_val + 1, size=(K, N), dtype=np.int32)
-if args.compile_mode == "compile-and-run":
-    # Stochastically sample num_sample results, and pass to XRTRunner backend for verification.
-    num_samples = 100
-    sampled_indices = np.vstack(
-        [
-            np.random.randint(0, 1, num_samples),  # i indices
-            np.random.randint(0, args.N, num_samples),  # j indices
-        ]
-    )
 
-    # Compute reference results for sampled indices
-    sampled_values = np.array(
-        [
-            np.sum(
-                (input_a[i, :].astype(np.int32) * input_b[:, j].astype(np.int32)),
-                dtype=np.int32,
-            )
-            for i, j in zip(*sampled_indices)
-        ],
-        dtype=np.int32,
-    )
-    print(sampled_values)
-
-    # Store as a dictionary
-    sampled_data = {
-        "shape": (1, args.N),
-        "indices": sampled_indices,
-        "values": sampled_values,
-    }
-    runner = XRTRunner(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        runtime_loop_tiling_sizes=[4, 4],
-    )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[input_a, input_b],
-            stochastic_expected_outputs=[sampled_data],
-        )
-    )
+npu_kernel = compile_air(
+    air_module,
+    verbose=args.verbose,
+    omit_while_true_loop=False,
+    runtime_loop_tiling_sizes=[4, 4],
+)
 
-elif args.compile_mode == "compile-only":
-    ###### Compile only
-    backend = XRTBackend(
-        verbose=args.verbose,
-        omit_while_true_loop=False,
-        runtime_loop_tiling_sizes=[4, 4],
-    )
-    module_function = backend.compile(air_module)
+if args.compile_mode == "compile-only":
+    exit(0)
 
-    backend.unload()
+# Stochastically sample num_samples results, and pass them to the runtime backend for verification.
+num_samples = 100
+sampled_indices = np.vstack(
+    [
+        np.random.randint(0, 1, num_samples),  # i indices
+        np.random.randint(0, args.N, num_samples),  # j indices
+    ]
+)
+
+# Compute reference results for sampled indices
+sampled_values = np.array(
+    [
+        np.sum(
+            (input_a[i, :].astype(np.int32) * input_b[:, j].astype(np.int32)),
+            dtype=np.int32,
+        )
+        for i, j in zip(*sampled_indices)
+    ],
+    dtype=np.int32,
+)
+print(sampled_values)
+
+# Store as a dictionary
+sampled_data = {
+    "shape": (1, args.N),
+    "indices": sampled_indices,
+    "values": sampled_values,
+}
+
+runtime = get_air_runtime()
+dtype = sampled_data["values"].dtype
+shape = sampled_data["shape"]
+if isinstance(shape, int):
+    shape = (shape,)
+io_args = [
+    aie.utils.tensor(input_a),
+    aie.utils.tensor(input_b),
+    aie.utils.tensor(np.zeros(shape, dtype)),
+]
+exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data]))
diff --git a/test/xrt/39_triton_matmul_ver3_vectorized/run.py b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
index 5099f2452..8f2a8128a 100644
--- a/test/xrt/39_triton_matmul_ver3_vectorized/run.py
+++ b/test/xrt/39_triton_matmul_ver3_vectorized/run.py
@@ -4,13 +4,12 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
 from ml_dtypes import bfloat16
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -125,15 +124,13 @@
     C = np.matmul(A, B).astype(output_type)  # Shape [M, N]
 
     ###### Compile and test
-    runner = XRTRunner(
-        omit_while_true_loop=False,
-        runtime_loop_tiling_sizes=[4, 4],
-    )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-3,
-        )
+    npu_kernel = compile_air(
+        air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4]
     )
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-3))
diff --git a/test/xrt/40_triton_vec_add/run.py b/test/xrt/40_triton_vec_add/run.py
index 0700e59eb..28e64c026 100644
--- a/test/xrt/40_triton_vec_add/run.py
+++ b/test/xrt/40_triton_vec_add/run.py
@@ -4,13 +4,12 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
 from ml_dtypes import bfloat16
-import filelock
+import aie.utils
 import numpy as np
 
 np.random.seed(42)
@@ -130,16 +129,16 @@
     C = np.add(A, B).astype(output_type)  # Shape [M]
 
     ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         use_lock_race_condition_fix=True,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-2,
-        )
-    )
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-2))
diff --git a/test/xrt/41_triton_softmax/run.py b/test/xrt/41_triton_softmax/run.py
index 392a9ef32..b8f299ed0 100644
--- a/test/xrt/41_triton_softmax/run.py
+++ b/test/xrt/41_triton_softmax/run.py
@@ -7,12 +7,11 @@
 import numpy as np
 
 np.random.seed(42)
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 
 parser = argparse.ArgumentParser(
     prog="run.py",
@@ -200,35 +199,18 @@ def softmax(x, axis=-1):
     # Run compile and load
     ###############################################
 
+    npu_kernel = compile_air(
+        air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4]
+    )
+
     if args.compile_only:
-        # Compile-only mode: generate xclbin and instruction binary without validation
-        print("Compile-only mode: generating xclbin and instruction binary...")
-        backend = XRTBackend(
-            omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4]
-        )
-        module_function = backend.compile(air_module)
-        backend.unload()
-        print("Compilation complete. Generated files:")
-        print("  - air.xclbin")
-        print("  - air.insts.bin")
-        print("Run profiling with: ./test.exe")
         exit(0)
-    else:
-        # Normal mode: compile and run validation
-        input_type = np.float32
-        A = np.random.rand(M, N).astype(input_type)  # Shape [M, N]
-        C = softmax(A).astype(input_type)
-
-        ###### Compile and test
-        runner = XRTRunner(
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[A],
-                expected_outputs=[C],
-                rtol=1e-2,
-            )
-        )
+
+    # Normal mode: run the already-compiled kernel and validate its output
+    input_type = np.float32
+    A = np.random.rand(M, N).astype(input_type)  # Shape [M, N]
+    C = softmax(A).astype(input_type)
+
+    runtime = get_air_runtime()
+    io_args = [aie.utils.tensor(A), aie.utils.tensor(np.zeros(C.shape, C.dtype))]
+    exit(runtime.run_test(npu_kernel, io_args, refs={1: C}, rtol=1e-2))
diff --git a/test/xrt/42_triton_softmax_bf16/run.py b/test/xrt/42_triton_softmax_bf16/run.py
index 40f01d479..9a03b3af3 100644
--- a/test/xrt/42_triton_softmax_bf16/run.py
+++ b/test/xrt/42_triton_softmax_bf16/run.py
@@ -7,12 +7,11 @@
 import numpy as np
 
 np.random.seed(42)
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 from ml_dtypes import bfloat16
 
 parser = argparse.ArgumentParser(
@@ -201,18 +200,13 @@ def softmax(x, axis=-1):
     C = softmax(A).astype(input_type)
 
     ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         output_format=args.output_format,
         instance_name="softmax_kernel",
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A],
-            expected_outputs=[C],
-            rtol=1e-2,
-            atol=1e-3,
-        )
-    )
+    runtime = get_air_runtime()
+    io_args = [aie.utils.tensor(A), aie.utils.tensor(np.zeros(C.shape, C.dtype))]
+    exit(runtime.run_test(npu_kernel, io_args, refs={1: C}, rtol=1e-2, atol=1e-3))
diff --git a/test/xrt/43_triton_layernorm/run.py b/test/xrt/43_triton_layernorm/run.py
index 43ce473e8..2baa7346b 100644
--- a/test/xrt/43_triton_layernorm/run.py
+++ b/test/xrt/43_triton_layernorm/run.py
@@ -5,12 +5,11 @@
 
 import argparse
 import numpy as np
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 
 parser = argparse.ArgumentParser(
     prog="run.py",
@@ -295,7 +294,8 @@ def layer_norm(x_arg, y, weight, bias, mean, rstd, eps=1e-5):
     y_expected = (x_arg - mean.reshape(-1, 1)) * rstd.reshape(-1, 1)  # Shape [M, N]
 
     ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         output_format=args.output_format,
         instance_name="_layer_norm_fwd_fused",
@@ -304,11 +304,16 @@ def layer_norm(x_arg, y, weight, bias, mean, rstd, eps=1e-5):
         debug_ir=args.debug_ir,
         runtime_loop_tiling_sizes=[4, 4],
     )
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(x_arg),
+        aie.utils.tensor(np.zeros(y_expected.shape, y_expected.dtype)),
+    ]
     exit(
-        runner.run_test(
-            air_module,
-            inputs=[x_arg],
-            expected_outputs=[y_expected],
+        runtime.run_test(
+            npu_kernel,
+            io_args,
+            refs={1: y_expected},
             rtol=5e-2 if args.bf16_emulation else 1e-2,
             atol=5e-1 if args.bf16_emulation else 1e-1,
         )
diff --git a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
index 0f3414561..1264eb960 100644
--- a/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
+++ b/test/xrt/44_triton_matmul_ver4_vector_ptr_opt/run.py
@@ -4,13 +4,12 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
 from ml_dtypes import bfloat16
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -133,17 +132,17 @@
     C = np.matmul(A, B).astype(output_type)  # Shape [M, N]
 
     ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         runtime_loop_tiling_sizes=[4, 4],
         output_format=args.output_format,
         instance_name="bare_matmul",
     )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-1,
-        )
-    )
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-1))
diff --git a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
index fb2ca22dd..b42523778 100644
--- a/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
+++ b/test/xrt/45_triton_matmul_ver4_strix_8x4/run.py
@@ -4,13 +4,12 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
 from ml_dtypes import bfloat16
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -134,17 +133,17 @@
     C = np.matmul(A, B).astype(output_type)  # Shape [M, N]
 
     ###### Compile and test
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         omit_while_true_loop=False,
         runtime_loop_tiling_sizes=[4, 4],
         output_format=args.output_format,
         instance_name="bare_matmul",
     )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-1,
-        )
-    )
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-1))
diff --git a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
index 83c7cdf03..1f1d75723 100644
--- a/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
+++ b/test/xrt/46_triton_matmul_ver4_strix_8x4_i8_i8_i32/run.py
@@ -7,12 +7,11 @@
 import numpy as np
 
 np.random.seed(42)
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 from ml_dtypes import bfloat16 as bfloat16_t
 
 parser = argparse.ArgumentParser(
@@ -130,54 +129,31 @@
     # Run compile and load
     ###############################################
 
-    # Determine output file extension based on format
-    output_ext = "elf" if args.output_format == "elf" else "xclbin"
+    npu_kernel = compile_air(
+        air_module,
+        omit_while_true_loop=False,
+        output_format=args.output_format,
+        instance_name="bare_matmul",
+        runtime_loop_tiling_sizes=[4, 4],
+    )
 
     if args.compile_only:
-        # Compile-only mode: generate binary without validation
-        print(f"Compile-only mode: generating {output_ext} binary...")
-        backend = XRTBackend(
-            omit_while_true_loop=False,
-            output_format=args.output_format,
-            instance_name="bare_matmul",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(air_module)
-        backend.unload()
-        print("Compilation complete. Generated files:")
-        print(f"  - air.{output_ext}")
-        if args.output_format == "xclbin":
-            print("  - air.insts.bin")
-        print("Run profiling with: ./test.exe")
         exit(0)
-    else:
-        # Normal mode: compile and run validation
-
-        input_type = np.int8
-        output_type = np.int32
-        A = np.random.randint(
-            low=0, high=8, size=(M, K), dtype=input_type
-        )  # Shape [M, K]
-        B = np.random.randint(
-            low=0, high=8, size=(K, N), dtype=input_type
-        )  # Shape [K, N]
-
-        C = np.matmul(A.astype(output_type), B.astype(output_type)).astype(
-            output_type
-        )  # Shape [M, N]
-
-        runner = XRTRunner(
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-            output_format=args.output_format,
-            instance_name="bare_matmul",
-            # verbose=True,
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[A, B],
-                expected_outputs=[C],
-                # rtol=1e-1,
-            )
-        )
+
+    # Normal mode: run the already-compiled kernel and validate its output
+    input_type = np.int8
+    output_type = np.int32
+    A = np.random.randint(low=0, high=8, size=(M, K), dtype=input_type)  # Shape [M, K]
+    B = np.random.randint(low=0, high=8, size=(K, N), dtype=input_type)  # Shape [K, N]
+
+    C = np.matmul(A.astype(output_type), B.astype(output_type)).astype(
+        output_type
+    )  # Shape [M, N]
+
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={2: C}))
diff --git a/test/xrt/47_multi_launch_pdi_reconfig/run.py b/test/xrt/47_multi_launch_pdi_reconfig/run.py
index 90f405955..77e0a2f5a 100644
--- a/test/xrt/47_multi_launch_pdi_reconfig/run.py
+++ b/test/xrt/47_multi_launch_pdi_reconfig/run.py
@@ -5,13 +5,14 @@
 # SPDX-License-Identifier: MIT
 
 """
-Test script that uses XRTRunner to compile, run, and validate the
+Test script that compiles, runs, and validates the
 multi-launch PDI reconfiguration example using ELF output format.
 """
 
 import numpy as np
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.ir import *
+import aie.utils
 
 # Define the AIR module with two air.launch operations using iteration spaces
 # - Launch 1 (add_two): iterates 8 times, processing tiles at offsets 0,16,32,...,112 (adds 2)
@@ -157,21 +158,19 @@ def main():
     with Context() as ctx, Location.unknown():
         air_module = Module.parse(air_tiled_ir_string)
 
-        # Create XRTRunner with ELF output format
-        # instance_name should match the func.func name (@reconfigure_example)
-        runner = XRTRunner(
+        npu_kernel = compile_air(
+            air_module,
             output_format="elf",
-            instance_name="reconfigure_example",  # matches func.func @reconfigure_example
+            instance_name="reconfigure_example",
             omit_while_true_loop=False,
             runtime_loop_tiling_sizes=[4, 4],
         )
-
-        # Run the test
-        result = runner.run_test(
-            mlir_module=air_module,
-            inputs=[input_data],
-            expected_outputs=[expected_output],
-        )
+        runtime = get_air_runtime()
+        io_args = [
+            aie.utils.tensor(input_data),
+            aie.utils.tensor(np.zeros(expected_output.shape, expected_output.dtype)),
+        ]
+        result = runtime.run_test(npu_kernel, io_args, refs={1: expected_output})
 
         return result
 
diff --git a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
index b6904605a..114e0ac2d 100644
--- a/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
+++ b/test/xrt/48_triton_matmul_ver4_strix_4x4_bf16_output/run.py
@@ -4,13 +4,12 @@
 # SPDX-License-Identifier: MIT
 
 import argparse
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
 from ml_dtypes import bfloat16
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -130,15 +129,13 @@
     C = np.matmul(A, B).astype(output_type)  # Shape [M, N]
 
     ###### Compile and test
-    runner = XRTRunner(
-        omit_while_true_loop=False,
-        runtime_loop_tiling_sizes=[4, 4],
-    )
-    exit(
-        runner.run_test(
-            air_module,
-            inputs=[A, B],
-            expected_outputs=[C],
-            rtol=1e-1,
-        )
+    npu_kernel = compile_air(
+        air_module, omit_while_true_loop=False, runtime_loop_tiling_sizes=[4, 4]
     )
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros(C.shape, C.dtype)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={2: C}, rtol=1e-1))
diff --git a/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py b/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py
index 130465fa5..615552fdb 100644
--- a/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py
+++ b/test/xrt/49_triton_softmax_optimized_bf16_strix/run.py
@@ -11,12 +11,11 @@
 import numpy as np
 
 np.random.seed(42)
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
-import filelock
+import aie.utils
 from ml_dtypes import bfloat16
 
 # Get the directory containing this script
@@ -258,48 +257,35 @@ def softmax(x, axis=-1):
     # Run compile and load
     ###############################################
 
+    npu_kernel = compile_air(
+        air_module,
+        omit_while_true_loop=False,
+        verbose=args.verbose,
+        debug_ir=args.debug_aircc,
+        output_format=args.output_format,
+        instance_name="softmax_kernel",
+        runtime_loop_tiling_sizes=[4, 4],
+    )
+
     if args.compile_only:
-        # Compile-only mode: generate xclbin and instruction binary without validation
-        print("Compile-only mode: generating xclbin and instruction binary...")
-        backend = XRTBackend(
-            omit_while_true_loop=False,
-            verbose=args.verbose,
-            debug_ir=args.debug_aircc,
-            output_format=args.output_format,
-            instance_name="softmax_kernel",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(air_module)
-        backend.unload()
-        print("Compilation complete. Generated files:")
-        print("  - air.xclbin")
-        print("  - air.insts.bin")
-        print("Run profiling with: ./test.exe")
         exit(0)
-    else:
-        # Normal mode: compile and run validation
-        input_type = bfloat16
-        # Generate random input in range [-512, 512]
-        A = (np.random.rand(M, N) * 1024 - 512).astype(
-            input_type
-        )  # Shape [M, N], range [-512, 512]
-        C = softmax(A).astype(input_type)
 
-        ###### Compile and test
-        runner = XRTRunner(
-            omit_while_true_loop=False,
-            verbose=args.verbose,
-            debug_ir=args.debug_aircc,
-            output_format=args.output_format,
-            instance_name="softmax_kernel",
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[A],
-                expected_outputs=[C],
-                rtol=0.04,  # 4% relative tolerance (matches mlir-aie reference)
-                atol=0.001,  # Absolute tolerance (matches mlir-aie reference)
-            )
+    # Normal mode: run the already-compiled kernel and validate its output
+    input_type = bfloat16
+    # Generate random input in range [-512, 512]
+    A = (np.random.rand(M, N) * 1024 - 512).astype(
+        input_type
+    )  # Shape [M, N], range [-512, 512]
+    C = softmax(A).astype(input_type)
+
+    runtime = get_air_runtime()
+    io_args = [aie.utils.tensor(A), aie.utils.tensor(np.zeros(C.shape, C.dtype))]
+    exit(
+        runtime.run_test(
+            npu_kernel,
+            io_args,
+            refs={1: C},
+            rtol=0.04,  # 4% relative tolerance (matches mlir-aie reference)
+            atol=0.001,  # Absolute tolerance (matches mlir-aie reference)
         )
+    )
diff --git a/test/xrt/50_multi_launch_attention/run.py b/test/xrt/50_multi_launch_attention/run.py
index 928a5843b..791658428 100644
--- a/test/xrt/50_multi_launch_attention/run.py
+++ b/test/xrt/50_multi_launch_attention/run.py
@@ -15,11 +15,12 @@
 4. Comparing results
 """
 
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.ir import *
 from ml_dtypes import bfloat16
 import numpy as np
 import os
+import aie.utils
 
 
 def softmax(x, axis=-1):
@@ -74,18 +75,21 @@ def softmax(x, axis=-1):
     # Run test
     ###############################################
 
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        air_module,
         output_format="elf",
-        instance_name="attention",  # matches func.func @attention
+        instance_name="attention",  # matches func.func @attention
         omit_while_true_loop=False,
         verbose=False,
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
-            mlir_module=air_module,
-            inputs=[Q, K_T, V, S_buffer, P_buffer],
-            expected_outputs=[O_expected],
-            atol=2e3,
-        )
-    )
+    runtime = get_air_runtime()
+    io_args = [  # last slot (5) = zeroed output; refs key presumably its index
+        aie.utils.tensor(Q),
+        aie.utils.tensor(K_T),
+        aie.utils.tensor(V),
+        aie.utils.tensor(S_buffer),
+        aie.utils.tensor(P_buffer),
+        aie.utils.tensor(np.zeros(O_expected.shape, O_expected.dtype)),  # output buffer
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={5: O_expected}, atol=2e3))
diff --git a/test/xrt/51_scf_if_channel_herd/run.py b/test/xrt/51_scf_if_channel_herd/run.py
index 4d5d4cbb4..d21118de4 100644
--- a/test/xrt/51_scf_if_channel_herd/run.py
+++ b/test/xrt/51_scf_if_channel_herd/run.py
@@ -21,7 +21,9 @@
 from air.dialects.func import FuncOp
 from air.dialects import arith, scf
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt import compile_air, get_air_runtime
+from air.backend.xrt_runner import type_mapper
+import aie.utils
 
 range_ = for_
 
@@ -143,10 +145,16 @@ def herd_body(tx, ty, sx, sy):
     input_a = np.full(IMAGE_SIZE, 0x2, dtype=INOUT_DATATYPE)
     output_b = np.full(IMAGE_SIZE, 0x5, dtype=INOUT_DATATYPE)
 
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        mlir_module,
         verbose=args.verbose,
         output_format=args.output_format,
         instance_name="copy",
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(runner.run_test(mlir_module, inputs=[input_a], expected_outputs=[output_b]))
+    runtime = get_air_runtime()
+    io_args = [  # last slot (1) = zeroed output; refs key presumably its index
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(np.zeros(output_b.shape, output_b.dtype)),  # output buffer
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={1: output_b}))
diff --git a/test/xrt/52_dma_pad_passthrough/run.py b/test/xrt/52_dma_pad_passthrough/run.py
index 85dc000c4..a8aaaca65 100644
--- a/test/xrt/52_dma_pad_passthrough/run.py
+++ b/test/xrt/52_dma_pad_passthrough/run.py
@@ -22,7 +22,9 @@
 from air.dialects import arith
 from air.dialects.memref import AllocOp, DeallocOp
 from air.dialects.func import FuncOp
-from air.backend.xrt_runner import XRTRunner, type_mapper
+from air.backend.xrt import compile_air, get_air_runtime
+from air.backend.xrt_runner import type_mapper
+import aie.utils
 
 INPUT_ROWS = 64
 INPUT_COLS = 480
@@ -189,16 +191,16 @@ def herd_body(tx, ty, sx, sy, h_l2_in, h_l2_out):
         "values": sampled_values,
     }
 
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        mlir_module,
         verbose=args.verbose,
         output_format=args.output_format,
         instance_name="pad_passthrough",
         runtime_loop_tiling_sizes=[4, 4],
     )
-    exit(
-        runner.run_test(
-            mlir_module,
-            inputs=[input_data],
-            stochastic_expected_outputs=[sampled_data],
-        )
-    )
+    runtime = get_air_runtime()
+    io_args = [  # the input, then a zero-filled buffer (presumably the output)
+        aie.utils.tensor(input_data),
+        aie.utils.tensor(np.zeros((INPUT_ROWS, PADDED_COLS), INOUT_DATATYPE)),
+    ]
+    exit(runtime.run_test(npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data]))
diff --git a/test/xrt/53_matmul_padding_bf16/run.py b/test/xrt/53_matmul_padding_bf16/run.py
index 8634ef1b6..320410c29 100644
--- a/test/xrt/53_matmul_padding_bf16/run.py
+++ b/test/xrt/53_matmul_padding_bf16/run.py
@@ -14,13 +14,12 @@
 import argparse
 import math
 
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
 from ml_dtypes import bfloat16
-import filelock
+import aie.utils
 
 import numpy as np
 
@@ -237,42 +236,44 @@
     C_ref = np.zeros((M_padded, N_padded), dtype=output_type)
     C_ref[:M_actual, :N_actual] = C_ref_actual
 
-    if args.compile_mode == "compile-and-run":
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-            output_format="elf" if needs_padding else "xclbin",
-            instance_name="matmul_bf16",
-        )
+    npu_kernel = compile_air(
+        air_module,
+        verbose=args.verbose,
+        omit_while_true_loop=False,
+        runtime_loop_tiling_sizes=[4, 4],
+        output_format="elf" if needs_padding else "xclbin",
+        instance_name="matmul_bf16",
+    )
 
-        num_samples = 200
-        sampled_row = np.random.randint(0, M_actual, num_samples)
-        sampled_col = np.random.randint(0, N_actual, num_samples)
-        sampled_indices = np.vstack([sampled_row, sampled_col])
-        sampled_values = np.array(
-            [C_ref_actual[r, c] for r, c in zip(sampled_row, sampled_col)],
-            dtype=output_type,
-        )
-        sampled_data = {
-            "shape": (M_padded, N_padded),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
+    if args.compile_mode == "compile-only":
+        exit(0)  # compile_air already ran above; skip the run
 
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[A, B],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)),
-            )
-        )
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
+    num_samples = 200
+    sampled_row = np.random.randint(0, M_actual, num_samples)
+    sampled_col = np.random.randint(0, N_actual, num_samples)
+    sampled_indices = np.vstack([sampled_row, sampled_col])
+    sampled_values = np.array(
+        [C_ref_actual[r, c] for r, c in zip(sampled_row, sampled_col)],
+        dtype=output_type,
+    )
+    sampled_data = {
+        "shape": (M_padded, N_padded),  # same shape as the output buffer below
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(A),
+        aie.utils.tensor(B),
+        aie.utils.tensor(np.zeros((M_padded, N_padded), output_type)),  # output buffer
+    ]
+    exit(
+        runtime.run_test(
+            npu_kernel,
+            io_args,
+            refs={},  # no full-array reference; sampled points only
+            stochastic_refs=[sampled_data],
+            rtol=max(1e-1, 2e-2 * (K_FULL / K_L2_TILE)),  # tolerance floor is 10%
         )
-        module_function = backend.compile(air_module)
-        backend.unload()
+    )
diff --git a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
index 9ae0e65c8..07ca53838 100644
--- a/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
+++ b/test/xrt/54_matmul_padding_f32_bf16_emulation/run.py
@@ -14,12 +14,12 @@
 import math
 import os
 
-from air.backend.xrt import XRTBackend
-from air.backend.xrt_runner import XRTRunner
+from air.backend.xrt import compile_air, get_air_runtime
 from air.compiler.util import run_transform
 from air.ir import *
 import air.passmanager
 from ml_dtypes import bfloat16
+import aie.utils
 
 import numpy as np
 
@@ -213,83 +213,80 @@
     input_b = np.zeros((K_FULL, N_alloc), dtype=np.float32)
     input_b[:, :N_actual] = (np.random.rand(K_FULL, N_actual) * 4).astype(np.float32)
 
-    if args.compile_mode == "compile-and-run":
-        num_samples = 100
-        sampled_indices = np.vstack(
+    npu_kernel = compile_air(
+        air_module,
+        verbose=args.verbose,
+        omit_while_true_loop=False,
+        runtime_loop_tiling_sizes=[4, 4],
+        output_format="elf",
+        instance_name="matmul_padding_kernel",
+        bf16_emulation=True,
+        debug_ir=True,
+    )
+
+    if args.compile_mode == "compile-only":
+        exit(0)  # compile_air already ran above; skip the run
+
+    num_samples = 100
+    sampled_indices = np.vstack(
+        [
+            np.random.randint(0, M_actual, num_samples),
+            np.random.randint(0, N_actual, num_samples),
+        ]
+    )
+
+    # Add deterministic boundary-tile samples to catch padding errors.
+    boundary_m = list(
+        set(
             [
-                np.random.randint(0, M_actual, num_samples),
-                np.random.randint(0, N_actual, num_samples),
+                min(M_actual - 1, m)
+                for m in [M_actual - 1, M_actual - TILE_M + 1, 0]
+                if m >= 0
             ]
         )
-
-        # Add deterministic boundary-tile samples to catch padding errors.
-        boundary_m = list(
-            set(
-                [
-                    min(M_actual - 1, m)
-                    for m in [M_actual - 1, M_actual - TILE_M + 1, 0]
-                    if m >= 0
-                ]
-            )
-        )
-        boundary_n = list(
-            set(
-                [
-                    min(N_actual - 1, n)
-                    for n in [N_actual - 1, N_actual - TILE_N + 1, 0]
-                    if n >= 0
-                ]
-            )
-        )
-        boundary_indices = np.array([[m, n] for m in boundary_m for n in boundary_n]).T
-        sampled_indices = np.hstack([sampled_indices, boundary_indices])
-
-        # Golden: truncate f32 inputs to bf16 (matching hardware truncf_op),
-        # then compute dot product with f32 accumulation.
-        input_a_bf16 = input_a.astype(bfloat16)
-        input_b_bf16 = input_b.astype(bfloat16)
-        sampled_values = np.array(
+    )
+    boundary_n = list(
+        set(
             [
-                np.sum(
-                    input_a_bf16[:, i].astype(np.float32)
-                    * input_b_bf16[:, j].astype(np.float32),
-                    dtype=np.float32,
-                )
-                for i, j in zip(*sampled_indices)
-            ],
-            dtype=np.float32,
+                min(N_actual - 1, n)
+                for n in [N_actual - 1, N_actual - TILE_N + 1, 0]
+                if n >= 0
+            ]
         )
+    )
+    boundary_indices = np.array([[m, n] for m in boundary_m for n in boundary_n]).T
+    sampled_indices = np.hstack([sampled_indices, boundary_indices])
 
-        sampled_data = {
-            "shape": (M_padded, N_padded),
-            "indices": sampled_indices,
-            "values": sampled_values,
-        }
-
-        runner = XRTRunner(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            runtime_loop_tiling_sizes=[4, 4],
-            output_format="elf",
-            instance_name="matmul_padding_kernel",
-            bf16_emulation=True,
-            debug_ir=True,
-        )
-        exit(
-            runner.run_test(
-                air_module,
-                inputs=[input_a, input_b],
-                stochastic_expected_outputs=[sampled_data],
-                rtol=0.1,
+    # Golden: truncate f32 inputs to bf16 (matching hardware truncf_op),
+    # then compute dot product with f32 accumulation.
+    input_a_bf16 = input_a.astype(bfloat16)
+    input_b_bf16 = input_b.astype(bfloat16)
+    sampled_values = np.array(
+        [
+            np.sum(
+                input_a_bf16[:, i].astype(np.float32)  # A presumably stored [K, M]
+                * input_b_bf16[:, j].astype(np.float32),
+                dtype=np.float32,
             )
+            for i, j in zip(*sampled_indices)  # i < M_actual, j < N_actual
+        ],
+        dtype=np.float32,
+    )
+
+    sampled_data = {
+        "shape": (M_padded, N_padded),  # same shape as the output buffer below
+        "indices": sampled_indices,
+        "values": sampled_values,
+    }
+
+    runtime = get_air_runtime()
+    io_args = [
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(input_b),
+        aie.utils.tensor(np.zeros((M_padded, N_padded), np.float32)),  # output buffer
+    ]
+    exit(
+        runtime.run_test(
+            npu_kernel, io_args, refs={}, stochastic_refs=[sampled_data], rtol=0.1
         )
-    elif args.compile_mode == "compile-only":
-        backend = XRTBackend(
-            verbose=args.verbose,
-            omit_while_true_loop=False,
-            output_format="elf",
-            bf16_emulation=True,
-            runtime_loop_tiling_sizes=[4, 4],
-        )
-        module_function = backend.compile(air_module)
-        backend.unload()
+    )
diff --git a/test/xrt/55_matmul_padding_bf16_npu1/run.py b/test/xrt/55_matmul_padding_bf16_npu1/run.py
index 18dffe15f..0691c684a 100644
--- a/test/xrt/55_matmul_padding_bf16_npu1/run.py
+++ b/test/xrt/55_matmul_padding_bf16_npu1/run.py
@@ -28,9 +28,10 @@
 from air.dialects.memref import AllocOp, DeallocOp, load, store, subview
 from air.dialects.func import FuncOp
 from air.dialects.scf import for_, yield_
-from air.backend.xrt_runner import XRTRunner, type_mapper
-from air.backend.xrt import XRTBackend
+from air.backend.xrt import compile_air, get_air_runtime
+from air.backend.xrt_runner import type_mapper
 from air.compiler.util import run_transform
+import aie.utils
 from air.extras import types as extrasT
 from air.dialects.linalg.opdsl.lang import *
 import air.dialects.linalg.opdsl.lang as linalg_lang
@@ -697,17 +698,25 @@ def epilogue_herd(
         "values": sampled_values,
     }
 
-    runner = XRTRunner(
+    npu_kernel = compile_air(
+        mlir_module,
         verbose=args.verbose,
         omit_while_true_loop=False,
         runtime_loop_tiling_sizes=[4, 4],
         instance_name="matmul_f32",
     )
+    runtime = get_air_runtime()
+    io_args = [  # the two inputs, then a zero-filled buffer (presumably the output)
+        aie.utils.tensor(input_a),
+        aie.utils.tensor(input_b),
+        aie.utils.tensor(np.zeros((M_padded, N_padded), np.float32)),
+    ]
     exit(
-        runner.run_test(
-            mlir_module,
-            inputs=[input_a, input_b],
-            stochastic_expected_outputs=[sampled_data],
+        runtime.run_test(
+            npu_kernel,
+            io_args,
+            refs={},
+            stochastic_refs=[sampled_data],
             rtol=0.1,
             max_mismatch_percentage=10,
         )