amd
diff --git a/examples/matmul/matmul.py b/examples/matmul_bf16_m64_n64_k64/matmul_bf16_m64_n64_k64.py
examples/matmul/matmul.py renamed to examples/matmul_bf16_m64_n64_k64/matmul_bf16_m64_n64_k64.py
Lines changed: 3 additions & 1 deletion
diff --git a/examples/matmul/transform_aie2.mlir b/examples/matmul_bf16_m64_n64_k64/transform_aie2.mlir
examples/matmul/transform_aie2.mlir renamed to examples/matmul_bf16_m64_n64_k64/transform_aie2.mlir
Lines changed: 66 additions & 135 deletions
diff --git a/examples/matmul/transform_aie2p.mlir b/examples/matmul_bf16_m64_n64_k64/transform_aie2p.mlir
examples/matmul/transform_aie2p.mlir renamed to examples/matmul_bf16_m64_n64_k64/transform_aie2p.mlir
Lines changed: 65 additions & 134 deletions
diff --git a/examples/padded_matmul/padded_matmul.py b/examples/matmul_f32_m64_n32_k16_padded_atransposed/matmul_f32_m64_n32_k16_padded_atransposed.py
examples/padded_matmul/padded_matmul.py renamed to examples/matmul_f32_m64_n32_k16_padded_atransposed/matmul_f32_m64_n32_k16_padded_atransposed.py
diff --git a/examples/padded_matmul/transform_aie2p.mlir b/examples/matmul_f32_m64_n32_k16_padded_atransposed/transform_aie2p.mlir
examples/padded_matmul/transform_aie2p.mlir renamed to examples/matmul_f32_m64_n32_k16_padded_atransposed/transform_aie2p.mlir
Lines changed: 22 additions & 29 deletions
diff --git a/examples/matmul_i8_m128_n64_k64/matmul_i8_m128_n64_k64.py b/examples/matmul_i8_m128_n64_k64/matmul_i8_m128_n64_k64.py
Lines changed: 98 additions & 0 deletions
@@ -87,7 +87,9 @@ def bench_matmul(M, N, K, provider):
 
 if __name__ == "__main__":
     benchmark.select_npu_backend()
-    for M in [2**i for i in range(8, 14, 2)]:  # change to "in range(9, 14, 2)" if BLOCK_SIZE_M=512
+    for M in [
+        2**i for i in range(8, 14, 2)
+    ]:  # change to "in range(9, 14, 2)" if BLOCK_SIZE_M=512
         for N in [2**i for i in range(8, 14, 2)]:
             for K in [2**i for i in range(8, 14, 2)]:
                 bench_matmul(M, N, K, "test")
 
@@ -1,41 +1,33 @@
-// Transform Script for F32 Matmul with BF16 Emulation
+// Auto-generated by matmul_transform.py — do not edit manually.
+// Parameters: l1_m=64, l1_n=32, l2_k=16, pack=[8,8,8], accum=f32, contract_in=bf16
 //
-// Starting IR: Full-K matmul (no K-loop), all f32, generated from asm_src params.
-//   - func @matmul_padding_kernel(memref<*xf32>*3, i32*6)
-//   - linalg.matmul(64xK @ Kx32 → 64x32), f32 accumulation
-//   - A in K×M layout (strides [1, M_alloc]), B in K×N (strides [N_alloc, 1])
-//
-// Follows test 53's transform pattern: tile copies, pack [8,8,8], tile K,
-// tile forall for multi-core, vectorize, hoist.
-//
-// Target: 4×8 AIE core array (Strix/NPU2), BF16 emulation
-// Tile sizes: M=64, N=32, K_L2=16, pack [8,8,8]
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
 
     //==========================================================================
-    // PHASE 1: TILE L3→L2 MEMORY COPIES
+    // PHASE 1: TILE L3->L2 MEMORY COPIES
+    // Tile memref copies for streaming data from DDR (L3) to MemTile (L2).
     //==========================================================================
 
         %func10 = transform.structured.match ops{["func.func"]} in %arg1  : (!transform.any_op) -> !transform.any_op
         %func10_updated = transform.air.convert_memref_copy_to_linalg_copy %func10 : (!transform.any_op) -> !transform.any_op
         %copies = transform.structured.match ops{["linalg.copy"]} in %arg1  : (!transform.any_op) -> !transform.any_op
         %copy1, %copy2 = transform.split_handle %copies : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
-        // Tile A copy: 64×K → 64×16 tiles (K_L2_TILE=16)
         %tiled_copy1, %tile_copy_loop1 =
           transform.structured.tile_using_for %copy1 tile_sizes [0, 16]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
         transform.annotate %tile_copy_loop1 "copy_a_loop" : !transform.any_op
-        // Tile B copy: K×32 → 16×32 tiles
         %tiled_copy2, %tile_copy_loop2 =
           transform.structured.tile_using_for %copy2 tile_sizes [16]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
         transform.annotate %tile_copy_loop2 "copy_b_loop" : !transform.any_op
 
     //==========================================================================
     // PHASE 2: PROMOTE OUTPUT TO L2
-    // No truncf fusion needed (output is f32).
+    // Allocate output buffer (C) in L2 for accumulation.
     //==========================================================================
 
         %result_l2 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
@@ -44,43 +36,47 @@ module attributes {transform.with_named_sequence} {
 
     //==========================================================================
     // PHASE 3: PACK MATMUL FOR VECTORIZED COMPUTATION
-    // Pack sizes [8, 8, 8] for M, N, K dimensions.
+    // Pack [8, 8, 8], transpose A/B/C, promote C pack to L1.
     //==========================================================================
 
         %matmul_to_pack = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
         %packed = transform.structured.pack %matmul_to_pack packed_sizes = [8, 8, 8]
           : (!transform.any_op) -> (!transform.any_op)
 
+        // Transpose A: outer_perm [1,0]
         %pack_producer_a = transform.get_producer_of_operand %packed[0]
           : (!transform.any_op) -> (!transform.any_op)
         %packed_a, %pack_a, %empty_unpack_a =
           transform.structured.pack_transpose %pack_producer_a with_compute_op(%packed)
           outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
+        // Transpose B: outer_perm [1,0] + inner_perm [1,0]
         %pack_producer_b = transform.get_producer_of_operand %packed_a[1]
           : (!transform.any_op) -> (!transform.any_op)
         %packed_b, %pack_b, %empty_unpack_b =
           transform.structured.pack_transpose %pack_producer_b with_compute_op(%packed_a)
           outer_perm = [1, 0] inner_perm = [1, 0] : (!transform.any_op, !transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
+        // Transpose C: outer_perm [1,0]
         %unpack = transform.get_consumers_of_result %packed_b[0]
           : (!transform.any_op) -> (!transform.any_op)
         %packed_c, %pack_c, %unpack_c =
           transform.structured.pack_transpose %unpack with_compute_op(%packed_b)
           outer_perm = [1, 0] : (!transform.any_op, !transform.any_op)
           -> (!transform.any_op, !transform.any_op, !transform.any_op)
 
+        // Promote C pack to L1
         %output_l1_pack_op_source_buffer, %output_l1_pack_op_new = transform.structured.bufferize_to_allocation %pack_c
             {memory_space = 2, bufferize_destination_only, memcpy_op = "linalg.copy", emit_dealloc} : !transform.any_op
 
-        // Annotate the packed matmul so we can find it after K-tiling
+        // Annotate for robust matching after K-tiling
         transform.annotate %packed_c "packed_matmul" : !transform.any_op
 
     //==========================================================================
     // PHASE 4: TILE K REDUCTION AND FUSE PACK OPERATIONS
-    // K/8 packed K-dim. Tile by 2 (= 16 raw K elements = K_L2_TILE).
+    // Tile packed K dim by 2 (= 16 raw K elements).
     //==========================================================================
 
         %tiled_reduction, %outer_for_loop =
@@ -93,9 +89,7 @@ module attributes {transform.with_named_sequence} {
 
     //==========================================================================
     // PHASE 5: TILE FOR MULTI-CORE PARALLELISM
-    // Packed C dims after pack [8,8,8] + outer_perm [1,0]:
-    //   [N/8, M/8, K/8] = [16, 32, K/8] → tile [8, 4, 0] → forall(2, 8)
-    //   par_to_herd maps to herd(8, 2) → collapse to 4×4
+    // Tile [8, 4, 0] for herd distribution.
     //==========================================================================
 
         %matmul_1 = transform.structured.match ops{["linalg.generic"]} attributes{packed_matmul} in %arg1 : (!transform.any_op) -> !transform.any_op
@@ -119,15 +113,13 @@ module attributes {transform.with_named_sequence} {
     // PHASE 6: PROMOTE INPUTS TO L1 AND TILE PROLOGUE/EPILOGUE
     //==========================================================================
 
+        // Promote A and B to L1
         %buffer_a, %new_a = transform.structured.bufferize_to_allocation %fused_lhs_l1_pack2
           {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
         %buffer_b, %new_b = transform.structured.bufferize_to_allocation %fused_rhs_l1_pack2
           {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
 
-    // Prologue: fill → generalize → interchange → tile_using_forall
-    // After packing, fill is on packed 4D tensor [N/8, M/8, 8, 8] = [16, 32, 8, 8].
-    // Interchange [1,0,2,3] swaps N/M dims → [32, 16, 8, 8].
-    // Tile [8, 4] → forall(4, 4) matching herd.
+        // Prologue: fill -> generalize -> interchange -> tile for herd
         %fill_op = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!transform.any_op) -> !transform.any_op
         %generic_fill_op = transform.structured.generalize %fill_op
             : (!transform.any_op) -> !transform.any_op
@@ -140,7 +132,7 @@ module attributes {transform.with_named_sequence} {
             : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
         transform.annotate %prologue_forall "prologue_forall" : !transform.any_op
 
-    // Epilogue: unpack → tile_using_forall [64, 32] for 4×4 herd
+        // Epilogue: unpack -> tile for L2 write-back
         %unpack_op = transform.structured.match ops{["linalg.unpack"]} in %arg1 : (!transform.any_op) -> !transform.any_op
         %epilogue_tiled_unpack, %epilogue_forall =
           transform.structured.tile_using_forall %unpack_op tile_sizes [64, 32]
@@ -195,8 +187,6 @@ module attributes {transform.with_named_sequence} {
 
         %generic1 = transform.structured.match ops{["linalg.generic"]} attributes{init_fill} in %arg1 : (!transform.any_op) -> !transform.any_op
         %generic2 = transform.structured.match ops{["linalg.generic"]} attributes{matmul_compute} in %arg1 : (!transform.any_op) -> !transform.any_op
-        // Per-core packed matmul: [4, 8, K/8, 8, 8, 8].
-        // Tile for vectorization: [2, 2, 1, 0, 0, 0] then unroll.
         %inner_most_generics, %vec_loops:3 =
           transform.structured.tile_using_for %generic2 tile_sizes [2, 2, 1, 0, 0, 0]
           : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
@@ -252,9 +242,12 @@ module attributes {transform.with_named_sequence} {
         %scf_fors_1 = transform.structured.match ops{["scf.for"]} in %herd2_1 : (!transform.any_op) -> !transform.any_op
         %innermost_for, %outer_fors = transform.split_handle %scf_fors_1 {overflow_result = 1} : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
-        // Cast vector.contract input types: inputs 0,1 to bf16, accumulator 2 and output to f32
+        // Cast accumulator (input[2]) and output[0] to f32
         %vector_contracts = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
         %result11 = transform.air.vector_type_cast %vector_contracts {target_element_type = f32, input_indices = [2], output_indices = [0]} : (!transform.any_op) -> !transform.any_op
+
+        // Cast vector.contract inputs 0,1 to bf16
+        // (matches hardware MAC unit native input type)
         %vector_contracts_2 = transform.structured.match ops{["vector.contract"]} in %arg1 : (!transform.any_op) -> !transform.any_op
         %result11b = transform.air.vector_type_cast %vector_contracts_2 {target_element_type = bf16, input_indices = [0, 1], output_indices = []} : (!transform.any_op) -> !transform.any_op
 
 
@@ -0,0 +1,98 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+# INT8 matmul with l1_m=128, l1_n=64, l2_k=64.
+# L1 budget (with pingpong): 16K(2*A) + 8K(2*B) + 32K(C) = 56KB / 64KB (88%).
+# BLOCK_SIZE_M=1024, BLOCK_SIZE_N=256 to fit 8x4 herd with per-core 128x64.
+#
+# Transform script generated by:
+#   python examples/matmul_transform.py --l1-m 128 --l1-n 64 --l2-k 64 \
+#       --pack-sizes 8 8 8 --accum-type i32 --contract-input-type i16 \
+#       -o examples/matmul_i8_m128_n64_k64/transform_aie2p.mlir
+
+import torch
+import triton
+import triton.language as tl
+import sys, os
+
+sys.path.append(os.path.abspath(".."))
+import benchmark
+
+
+@triton.jit
+def bare_matmul_i8(
+    A,
+    B,
+    C,
+    M: tl.constexpr,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    stride_am: tl.constexpr,
+    stride_ak: tl.constexpr,
+    stride_bk: tl.constexpr,
+    stride_bn: tl.constexpr,
+    stride_cm: tl.constexpr,
+    stride_cn: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    a_block = tl.load(A + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_block = tl.load(B + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
+
+    c_block = tl.dot(a_block, b_block)
+
+    tl.store(C + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn, c_block)
+
+
+def bench_matmul_i8(M, N, K, provider):
+    device = "cpu"
+    dtype_in = torch.int8
+    dtype_out = torch.int32
+    a = torch.randint(-8, 8, (M, K), device=device, dtype=dtype_in)
+    b = torch.randint(-8, 8, (K, N), device=device, dtype=dtype_in)
+    c = torch.empty((M, N), device=device, dtype=dtype_out)
+    if provider == "torch" or provider == "test":
+        c_ref = torch.matmul(a.to(dtype_out), b.to(dtype_out))
+    if provider == "triton" or provider == "test":
+        grid = lambda META: (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]),
+            triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
+        compiled_kernel = bare_matmul_i8[grid](
+            a,
+            b,
+            c,
+            M,
+            N,
+            K,
+            a.stride(0),
+            a.stride(1),
+            b.stride(0),
+            b.stride(1),
+            c.stride(0),
+            c.stride(1),
+            BLOCK_SIZE_M=1024,
+            BLOCK_SIZE_N=256,
+            BLOCK_SIZE_K=K,
+        )
+        with open("tt.shared.mlir", "w") as f:
+            f.write(str(compiled_kernel.asm["ttsharedir"]))
+        if provider == "test":
+            torch.testing.assert_close(c, c_ref, atol=0, rtol=0)
+
+
+if __name__ == "__main__":
+    benchmark.select_npu_backend()
+    for M in [1024, 2048, 4096]:
+        for N in [1024, 2048]:
+            for K in [256, 512, 1024]:
+                bench_matmul_i8(M, N, K, "test")
+                bench_matmul_i8(M, N, K, "test")