From 672f2e3c96f4bc6e03a6d67668abcf87b5437c13 Mon Sep 17 00:00:00 2001
From: James Newling
Date: Fri, 28 Feb 2025 09:00:00 -0800
Subject: [PATCH] Parameterize trunci scale and shift in the matmul scale trunci test

---
 .../matmul_template/matmul_generator.py        | 17 ++++++-
 .../matmul_trunci_scaling_MxK_KxN.mlir         | 13 ++---
 build_tools/ci/cpu_comparison/run.py           | 50 +++++++++++--------
 3 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
index f02e17023..6511b0b83 100644
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_generator.py
@@ -10,7 +10,19 @@ def get_higher_order_element_type(element_type):
 
 
 def generate_matmul_test(
-    output_fn, input_fn, m, n, k, lhs_rhs_type, acc_type, b=0, m0=0, n0=0, k0=0
+    output_fn,
+    input_fn,
+    m,
+    n,
+    k,
+    lhs_rhs_type,
+    acc_type,
+    b=0,
+    m0=0,
+    n0=0,
+    k0=0,
+    trunci_scale=None,
+    trunci_shift=None,
 ):
     """
     Generate mlir file (output_fn) from the template file (input_fn).
@@ -34,6 +46,9 @@ def generate_matmul_test(
     # This is only used for batch matmul.
     replace["B"] = b
 
+    replace["TRUNCI_SCALE"] = trunci_scale
+    replace["TRUNCI_SHIFT"] = trunci_shift
+
     # m0, n0, k0 are only used for matmul4d as inner dim sizes.
     replace["M0"] = m0
     replace["N0"] = n0
diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
index 5ed4a849b..8759d8938 100644
--- a/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
+++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_scaling_MxK_KxN.mlir
@@ -2,15 +2,16 @@
 // input ${K}x${N}x${TYPE1}
 
 // Matmul + Trunci variant with scaling.
-// In an actual quantized model, truncating from a higher bitwidth to a lower precision bitwidth
-// won't work and we need to scale.
-// Since the output of the Matmul here is an integer cannot be multiplied with a floating point
-// scale factor, we need to represent the scale factor with a multiplier and a shift operator instead.
+// In an actual quantized model, truncating from a higher bitwidth to a lower
+// precision bitwidth won't work and we need to scale. Since the output of the
+// matmul here is an integer, it cannot be multiplied by a floating-point
+// scale factor, so we represent the scale factor with an integer multiplier
+// and a shift instead.
 
 func.func @matmul_trunci(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${K}x${N}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE1}> {
   %cst = arith.constant ${ZERO} : ${TYPE2}
-  %cst_mul = arith.constant 10 : ${TYPE_MUL_RESULT}
-  %cst_shift = arith.constant 7 : ${TYPE_MUL_RESULT}
+  %cst_mul = arith.constant ${TRUNCI_SCALE} : ${TYPE_MUL_RESULT}
+  %cst_shift = arith.constant ${TRUNCI_SHIFT} : ${TYPE_MUL_RESULT}
   %0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}>
   %i8out = tensor.empty() : tensor<${M}x${N}x${TYPE1}>
   %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}>
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index f081a2b4f..ccbb9bd9f 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -733,10 +733,9 @@ def __init__(
         K,
         input_type,
         acc_type,
-        lhs,
-        rhs,
-        expected_out,
         test_params=None,
+        trunci_scale=10,
+        trunci_shift=7,
     ):
         super().__init__(
             name=f"matmul_scale_trunci_{M}_{N}_{K}_{input_type}_{acc_type}",
@@ -749,23 +748,42 @@ def __init__(
         )
         self.labels.append("MatmulScaleTrunci")
 
-        # Assertions on shapes: Check that lhs is MxK, rhs is KxN, and expected_out is MxN
-        assert lhs.shape == (M, K)
-        assert rhs.shape == (K, N)
-        assert expected_out.shape == (M, N)
+        self.trunci_scale = trunci_scale
+        self.trunci_shift = trunci_shift
+        self.lhs = np.random.randint(0, 3, (self.M, self.K)).astype(np.int8)
+        self.rhs = np.random.randint(0, 3, (self.K, self.N)).astype(np.int8)
+        self.expected_out = np.right_shift(
+            (
+                (self.lhs.astype(np.int32) @ self.rhs.astype(np.int32))
+                * self.trunci_scale
+            ),
+            self.trunci_shift,
+        ).astype(np.int8)
 
-        self.lhs = lhs
-        self.rhs = rhs
-        self.expected_out = expected_out
+        # Sanity check on the shape of the computed expected output: it is MxN.
+        assert self.expected_out.shape == (M, N)
 
     def _execute(self, config):
         matmul_template_dir = config.file_dir / "matmul_template"
         template_name = matmul_template_dir / "matmul_trunci_scaling_MxK_KxN.mlir"
-        self.generate(config, template_name)
+
+        generate_matmul_test(
+            self.get_filename(config),
+            template_name,
+            k=self.K,
+            m=self.M,
+            n=self.N,
+            lhs_rhs_type=self.input_type,
+            acc_type=self.acc_type,
+            trunci_scale=self.trunci_scale,
+            trunci_shift=self.trunci_shift,
+        )
+
         filename = self.get_filename(config)
         input_args = generate_inputs(
             filename, self.get_dir(config), 1, {1: self.lhs, 2: self.rhs}
         )
+
         aie_vs_baseline(
             config=config,
             aie_compilation_flags=self.aie_compilation_flags,
@@ -1576,9 +1594,6 @@ def __init__(self):
                 128,
                 "i8",
                 "i32",
-                2 * np.ones([256, 128], dtype=np.int8),
-                3 * np.ones([128, 256], dtype=np.int8),
-                60 * np.ones([256, 256], dtype=np.int8),
                 test_params=TestParams(
                     name_suffix="scaling",
                     tile_pipeline="pack-peel-4-level-tiling",
@@ -1599,9 +1614,6 @@ def __init__(self):
                 128,
                 "i8",
                 "i32",
-                2 * np.ones([256, 128], dtype=np.int8),
-                3 * np.ones([128, 256], dtype=np.int8),
-                60 * np.ones([256, 256], dtype=np.int8),
                 test_params=TestParams(
                     tile_pipeline="pack-peel-4-level-tiling",
                     run_on_target=["npu1_4col"],
@@ -1620,9 +1632,6 @@ def __init__(self):
                 128,
                 "i8",
                 "i32",
-                2 * np.ones([256, 128], dtype=np.int8),
-                3 * np.ones([128, 256], dtype=np.int8),
-                60 * np.ones([256, 256], dtype=np.int8),
                 test_params=TestParams(
                     tile_pipeline="pack-peel-4-level-tiling",
                     run_on_target=["npu4"],
...
                 ),
             )
         )
+
         # Matmul with truncf test(s):
         for tile_pipeline in ["pack-peel", "pack-peel-4-level-tiling"]:
             self.register(
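
The expected output that the test now builds in its constructor follows the multiplier-plus-shift requantization described in the MLIR template comment: accumulate in int32, multiply by an integer scale, arithmetic right shift, then truncate to int8, so that scale / 2**shift (10 / 128, roughly 0.078, with the defaults above) stands in for a fractional scale factor. A minimal NumPy sketch of that reference computation; the helper name reference_matmul_trunci and the 256x128 shapes are illustrative, not part of the patch:

import numpy as np


def reference_matmul_trunci(lhs, rhs, trunci_scale=10, trunci_shift=7):
    # Accumulate the int8 matmul in int32, apply the integer scale, then an
    # arithmetic right shift, and truncate back to int8. The pair
    # (trunci_scale, trunci_shift) replaces a floating-point scale factor,
    # which cannot be applied directly to the integer accumulator.
    acc = lhs.astype(np.int32) @ rhs.astype(np.int32)
    return np.right_shift(acc * trunci_scale, trunci_shift).astype(np.int8)


# Example with the same small value range the test uses, so the scaled
# accumulator stays within int8 after the shift.
lhs = np.random.randint(0, 3, (256, 128)).astype(np.int8)
rhs = np.random.randint(0, 3, (128, 256)).astype(np.int8)
out = reference_matmul_trunci(lhs, rhs)
assert out.shape == (256, 256)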