Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 30 additions & 98 deletions programming_examples/average_pool/average_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@
Uses a 1x2 AIE herd with DMA transfers between L3 and L1 memory.
"""

import argparse
from ml_dtypes import bfloat16

from air.ir import *
from air.dialects.affine import apply as affine_apply
from air.dialects.air import *
from air.dialects import arith
from air.dialects.arith import ConstantOp
Expand All @@ -29,8 +27,7 @@
)
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_
from air.backend.xrt_runner import XRTRunner, type_mapper
from air.backend.xrt import XRTBackend
from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu

import numpy as np

Expand All @@ -54,16 +51,8 @@ def build_module(m, n, tile_m, np_dtype_in):
l3outputMemrefTy = MemRefType.get(out_size, xrt_dtype_in)

# L1 MemRefTypes
l1MemrefTy = MemRefType.get(
shape=[tile_m, n],
element_type=xrt_dtype_in,
memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
)
l1outputMemrefTy = MemRefType.get(
shape=[tile_m, 1],
element_type=xrt_dtype_in,
memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
)
l1MemrefTy = l1_memref_type([tile_m, n], xrt_dtype_in)
l1outputMemrefTy = l1_memref_type([tile_m, 1], xrt_dtype_in)

@FuncOp.from_py_func(l3memrefTy, l3outputMemrefTy)
def average_pool(arg0, arg2):
Expand All @@ -85,20 +74,7 @@ def herd_body(

for _l_ivx in range_(0, m, tile_m * num_tiles):

offset_map = AffineMap.get(
0,
2,
[
AffineExpr.get_add(
AffineSymbolExpr.get(0),
AffineExpr.get_mul(
AffineSymbolExpr.get(1),
AffineConstantExpr.get(tile_m),
),
)
],
)
offset = affine_apply(offset_map, [_l_ivx, _ty])
offset = tile_offset_1d(_l_ivx, _ty, tile_m)

dma_memcpy_nd(
l1_a_data,
Expand Down Expand Up @@ -151,16 +127,16 @@ def herd_body(
)
cst0 = arith.ConstantOp(xrt_dtype_in, 0.0)
v_a = transfer_read(
VectorType.get([n], xrt_dtype_in),
vec_type(n, xrt_dtype_in),
collapse_a,
[c0],
AffineMapAttr.get(AffineMap.get_identity(1)),
identity_map_attr(),
cst0,
[True],
)
# Multiply by 1/N before reduction to avoid scalar bf16
# multiply which can produce corrupted output on AIE2.
v_inv_n = broadcast(VectorType.get([n], xrt_dtype_in), inv_n)
v_inv_n = broadcast(vec_type(n, xrt_dtype_in), inv_n)
v_scaled = arith.mulf(v_a, v_inv_n)
v_avg = reduction(xrt_dtype_in, CombiningKind.ADD, v_scaled)
store(v_avg, collapse_c, [c0])
Expand Down Expand Up @@ -188,20 +164,7 @@ def herd_body(
TILE_M = 256
INPUT_DATATYPE = bfloat16

parser = argparse.ArgumentParser(
prog="run.py",
description="Builds, runs, and tests the AveragePool example",
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
)
parser.add_argument(
"-p",
"--print-module-only",
action="store_true",
)
parser = make_air_parser("Builds, runs, and tests the AveragePool example")
parser.add_argument(
"--m",
type=int,
Expand All @@ -215,20 +178,6 @@ def herd_body(
help="Input size (dimension N, pool width)",
)
parser.add_argument("--tile-m", type=int, default=TILE_M, help="Tile size M")
parser.add_argument(
"--compile-mode",
type=str,
choices=["compile-only", "compile-and-run"],
dest="compile_mode",
default="compile-and-run",
)
parser.add_argument(
"--output-format",
type=str,
choices=["xclbin", "elf"],
default="xclbin",
dest="output_format",
)

args = parser.parse_args()

Expand All @@ -246,46 +195,29 @@ def herd_body(
args.m, args.n
)

if args.compile_mode == "compile-and-run":

num_samples = 100
sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])

# AveragePool reference: sum of (each element * 1/N) per row
inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n)
sampled_values = np.array(
[np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)],
dtype=INPUT_DATATYPE,
)
num_samples = 100
sampled_indices = np.vstack([np.random.randint(0, args.m, num_samples)])

sampled_data = {
"shape": (args.m,),
"indices": sampled_indices,
"values": sampled_values,
}
# AveragePool reference: sum of (each element * 1/N) per row
inv_n_bf16 = INPUT_DATATYPE(1.0 / args.n)
sampled_values = np.array(
[np.sum(input_a[i] * inv_n_bf16) for i in zip(*sampled_indices)],
dtype=INPUT_DATATYPE,
)

runner = XRTRunner(
verbose=args.verbose,
omit_while_true_loop=False,
output_format=args.output_format,
sampled_data = {
"shape": (args.m,),
"indices": sampled_indices,
"values": sampled_values,
}

exit(
run_on_npu(
args,
mlir_module,
inputs=[input_a],
instance_name="average_pool",
runtime_loop_tiling_sizes=[4, 4],
)
exit(
runner.run_test(
mlir_module,
inputs=[input_a],
stochastic_expected_outputs=[sampled_data],
rtol=1e-1,
)
stochastic_expected_outputs=[sampled_data],
rtol=1e-1,
)

elif args.compile_mode == "compile-only":
backend = XRTBackend(
verbose=args.verbose,
omit_while_true_loop=False,
output_format=args.output_format,
runtime_loop_tiling_sizes=[4, 4],
)
module_function = backend.compile(mlir_module)
backend.unload()
)
145 changes: 36 additions & 109 deletions programming_examples/axpy/axpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,23 @@
with configurable VECTOR_SIZE (default 16).
"""

import argparse
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from ml_dtypes import bfloat16

from air.ir import *
from air.dialects.affine import apply as affine_apply
from air.dialects.air import *
from air.dialects import arith
from air.dialects.arith import ConstantOp
from air.dialects.memref import AllocOp, DeallocOp, subview
from air.dialects.vector import transfer_read, transfer_write, BroadcastOp, fma
from air.dialects.memref import AllocOp, DeallocOp
from air.dialects.vector import BroadcastOp, fma
from air.dialects.func import FuncOp
from air.dialects.scf import for_, yield_
from air.backend.xrt_runner import XRTRunner, type_mapper
from air.backend.xrt import XRTBackend
from air.backend.xrt_runner import type_mapper, make_air_parser, run_on_npu
from utils import vec_read, vec_write

import numpy as np

Expand All @@ -44,18 +47,10 @@ def build_module(n, tile_n, np_dtype_in, alpha=2.0, vector_size=16):
VECTOR_SIZE = vector_size
index_type = IndexType.get()

# L3 MemRefTypes
l3memrefTy = MemRefType.get([n], xrt_dtype_in)

# L1 MemRefTypes
l1MemrefTy = MemRefType.get(
shape=[tile_n],
element_type=xrt_dtype_in,
memory_space=IntegerAttr.get(T.i32(), MemorySpace.L1),
)

vecTy = VectorType.get([VECTOR_SIZE], xrt_dtype_in)
identity_map = AffineMapAttr.get(AffineMap.get_identity(1))
l1MemrefTy = l1_memref_type([tile_n], xrt_dtype_in)
vecTy = vec_type(VECTOR_SIZE, xrt_dtype_in)
imap = identity_map_attr()

@FuncOp.from_py_func(l3memrefTy, l3memrefTy, l3memrefTy)
def axpy(arg0, arg1, arg2):
Expand All @@ -80,21 +75,7 @@ def herd_body(
l1_out_data = AllocOp(l1MemrefTy, [], [])

for _l_ivx in range_(0, n, tile_n * num_tiles):

offset_map = AffineMap.get(
0,
2,
[
AffineExpr.get_add(
AffineSymbolExpr.get(0),
AffineExpr.get_mul(
AffineSymbolExpr.get(1),
AffineConstantExpr.get(tile_n),
),
)
],
)
offset = affine_apply(offset_map, [_l_ivx, _ty])
offset = tile_offset_1d(_l_ivx, _ty, tile_n)

dma_memcpy_nd(
l1_x_data,
Expand All @@ -121,29 +102,11 @@ def herd_body(
v_a = BroadcastOp(vecTy, a_const)

for j in range_(c0, cTileN, cVecSize):
sub_x = subview(
l1_x_data.result,
[j],
[VECTOR_SIZE],
[1],
)
sub_y = subview(
l1_y_data.result,
[j],
[VECTOR_SIZE],
[1],
)
sub_out = subview(
l1_out_data.result,
[j],
[VECTOR_SIZE],
[1],
)
v_x = transfer_read(vecTy, sub_x, [c0], identity_map, cst0, [True])
v_y = transfer_read(vecTy, sub_y, [c0], identity_map, cst0, [True])
v_x = vec_read(l1_x_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
v_y = vec_read(l1_y_data, j, VECTOR_SIZE, c0, vecTy, cst0, imap)
# a * x + y via vector.fma
v_result = fma(v_a, v_x, v_y)
transfer_write(None, v_result, sub_out, [c0], identity_map, [True])
vec_write(v_result, l1_out_data, j, VECTOR_SIZE, c0, imap)
yield_([])

# Write result from l1_out back to L3 output buffer
Expand All @@ -167,12 +130,7 @@ def herd_body(
INPUT_DATATYPE = bfloat16
ALPHA = 2.0

parser = argparse.ArgumentParser(
prog="run.py",
description="Builds, runs, and tests the AXPY example",
)
parser.add_argument("-v", "--verbose", action="store_true")
parser.add_argument("-p", "--print-module-only", action="store_true")
parser = make_air_parser("Builds, runs, and tests the AXPY example")
parser.add_argument("--n", type=int, default=N, help="Total number of elements")
parser.add_argument("--tile-n", type=int, default=TILE_N, help="Tile size")
parser.add_argument(
Expand All @@ -184,20 +142,6 @@ def herd_body(
default=16,
help="Vector size for SIMD operations",
)
parser.add_argument(
"--compile-mode",
type=str,
choices=["compile-only", "compile-and-run"],
dest="compile_mode",
default="compile-and-run",
)
parser.add_argument(
"--output-format",
type=str,
choices=["xclbin", "elf"],
default="xclbin",
dest="output_format",
)

args = parser.parse_args()

Expand All @@ -211,41 +155,24 @@ def herd_body(
input_x = np.random.randn(args.n).astype(INPUT_DATATYPE)
input_y = np.random.randn(args.n).astype(INPUT_DATATYPE)

if args.compile_mode == "compile-and-run":
num_samples = 100
sampled_indices = np.vstack([np.random.randint(0, args.n, num_samples)])
sampled_values = np.array(
[(args.alpha * input_x[i] + input_y[i]) for i in zip(*sampled_indices)],
dtype=INPUT_DATATYPE,
)
sampled_data = {
"shape": (args.n,),
"indices": sampled_indices,
"values": sampled_values,
}

runner = XRTRunner(
verbose=args.verbose,
omit_while_true_loop=False,
output_format=args.output_format,
sampled_indices = np.vstack([np.random.randint(0, args.n, 100)])
sampled_values = np.array(
[args.alpha * input_x[i] + input_y[i] for i in zip(*sampled_indices)],
dtype=INPUT_DATATYPE,
)
sampled_data = {
"shape": (args.n,),
"indices": sampled_indices,
"values": sampled_values,
}

exit(
run_on_npu(
args,
mlir_module,
inputs=[input_x, input_y],
instance_name="axpy",
runtime_loop_tiling_sizes=[4, 4],
stochastic_expected_outputs=[sampled_data],
rtol=1e-2,
)
exit(
runner.run_test(
mlir_module,
inputs=[input_x, input_y],
stochastic_expected_outputs=[sampled_data],
rtol=1e-2,
)
)

elif args.compile_mode == "compile-only":
backend = XRTBackend(
verbose=args.verbose,
omit_while_true_loop=False,
output_format=args.output_format,
runtime_loop_tiling_sizes=[4, 4],
)
module_function = backend.compile(mlir_module)
backend.unload()
)
Loading
Loading