Skip to content

Commit 2a336cc

Browse files
committed
Merge remote-tracking branch 'origin/fc/astral-fix-3x3' into lg/scarv
2 parents cd0ec88 + c20c03c commit 2a336cc

File tree

9 files changed

+283
-50
lines changed

9 files changed

+283
-50
lines changed

neureka/hal/neureka_task.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,16 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
166166
.d2 = h_out_stride};
167167
task->data.cfg.output_stride = output_stride;
168168

169-
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES;
170169
if (task->kernel_shape == 1) { // 1x1
170+
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1;
171171
task->data.cfg.weights_stride.d1 =
172-
NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in;
172+
(NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 / 8) * task->qw * num_k_in;
173173
} else if (!task->depthwise) { // 3x3
174+
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3;
174175
task->data.cfg.weights_stride.d1 =
175-
NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in;
176+
NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 * task->qw * num_k_in;
176177
} else { // 3x3 depthwise
178+
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3;
177179
task->data.cfg.weights_stride.d1 = 0;
178180
}
179181
task->data.cfg.weights_stride.d2 = 0;

neureka/hal/neureka_task_defs.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
#ifndef NNX_NEUREKA_PE_W
3030
#define NNX_NEUREKA_PE_W (4)
3131
#endif
32+
#define NNX_NEUREKA_BANDWIDTH_1x1 (256)
33+
#define NNX_NEUREKA_BANDWIDTH_3x3 (288)
3234

3335
#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (NNX_NEUREKA_PE_H)
3436
#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (NNX_NEUREKA_PE_W)
@@ -38,12 +40,13 @@
3840
#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (NNX_NEUREKA_PE_W+2)
3941
#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (32)
4042

41-
#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (4)
42-
#define NEUREKA_SUBTILE_OUTPUT_WIDTH (4)
43+
#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (NNX_NEUREKA_PE_H)
44+
#define NEUREKA_SUBTILE_OUTPUT_WIDTH (NNX_NEUREKA_PE_W)
4345
#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32)
4446

4547
#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32)
46-
#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32)
48+
#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 (NNX_NEUREKA_BANDWIDTH_1x1/8)
49+
#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 (NNX_NEUREKA_BANDWIDTH_3x3/8)
4750

4851
#define NEUREKA_ECC_REGS_NUM (4)
4952

test/HeaderWriter.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,10 @@ def render_vector(self, name, size, _type, init=None, elements_per_row=10):
9797
return retval
9898

9999
def check_declaration(self, name):
100-
return f"void check_{name}();\n\n"
100+
return f"int check_{name}();\n\n"
101101

102102
def check(self, name):
103-
return f"""void check_{name}() {{
103+
return f"""int check_{name}() {{
104104
printf("Checking the {name} vector:\\n");
105105
106106
int n_err = 0;
@@ -115,6 +115,7 @@ def check(self, name):
115115
printf("> Success! No errors found.\\n");
116116
else
117117
printf("> Failure! Found %d/%d errors.\\n", n_err, {name.upper()}_SIZE);
118+
return n_err;
118119
}}
119120
120121
"""

test/NeuralEngineFunctionalModel.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,23 @@
22

33
import torch
44
import torch.nn.functional as F
5+
import numpy as np
56

67
from TestClasses import IntegerType, Padding, Stride
78

89

910
class NeuralEngineFunctionalModel:
1011
ACCUMULATOR_TYPE = IntegerType(name="int32")
1112

13+
@staticmethod
14+
def _tensor_to_hex(tensor):
15+
int_tensor = np.asarray(torch.floor(tensor).to(torch.int64))
16+
int_tensor[int_tensor < 0] = 0xffffffff + (int_tensor[int_tensor < 0]+1)
17+
hex_tensor = np.empty(int_tensor.shape, dtype=object)
18+
for idx in np.ndindex(int_tensor.shape):
19+
hex_tensor[idx] = hex(int_tensor[idx].item())
20+
return hex_tensor
21+
1222
@staticmethod
1323
def _cast(
1424
tensor: torch.Tensor, _type: IntegerType, saturate: bool = False
@@ -36,7 +46,10 @@ def _norm_quant(
3646

3747
if verbose:
3848
print("INTERMEDIATE RESULTS (after scale):")
39-
print(tensor)
49+
current_threshold = np.get_printoptions()['threshold']
50+
np.set_printoptions(threshold=np.inf)
51+
print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
52+
np.set_printoptions(threshold=current_threshold)
4053

4154
if has_bias:
4255
assert bias is not None
@@ -54,13 +67,23 @@ def _norm_quant(
5467

5568
if verbose:
5669
print("INTERMEDIATE RESULTS (after bias):")
57-
print(tensor)
70+
current_threshold = np.get_printoptions()['threshold']
71+
np.set_printoptions(threshold=np.inf)
72+
print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
73+
np.set_printoptions(threshold=current_threshold)
5874

5975
if has_relu:
6076
tensor = F.relu(tensor)
6177

6278
tensor = tensor >> global_shift
6379

80+
if verbose:
81+
print("INTERMEDIATE RESULTS (after shift):")
82+
current_threshold = np.get_printoptions()['threshold']
83+
np.set_printoptions(threshold=np.inf)
84+
print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
85+
np.set_printoptions(threshold=current_threshold)
86+
6487
# Saturate into out_type
6588
tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True)
6689

@@ -98,6 +121,15 @@ def convolution(
98121
0,
99122
)
100123

124+
if verbose:
125+
print("INPUTS (padded):")
126+
current_threshold = np.get_printoptions()['threshold']
127+
np.set_printoptions(threshold=np.inf)
128+
print(NeuralEngineFunctionalModel._tensor_to_hex(input_padded))
129+
print("WEIGHTS (padded):")
130+
print(NeuralEngineFunctionalModel._tensor_to_hex(weight))
131+
np.set_printoptions(threshold=current_threshold)
132+
101133
# Accumulators are 32bit non-saturating.
102134
# Calculate in higher precision (int64)
103135
output = F.conv2d(
@@ -114,7 +146,10 @@ def convolution(
114146

115147
if verbose:
116148
print("INTERMEDIATE RESULTS (pre-normalization/requant):")
117-
print(output)
149+
current_threshold = np.get_printoptions()['threshold']
150+
np.set_printoptions(threshold=np.inf)
151+
print(NeuralEngineFunctionalModel._tensor_to_hex(output))
152+
np.set_printoptions(threshold=current_threshold)
118153

119154
if has_norm_quant:
120155
assert scale is not None

test/NeurekaMemoryLayout.py

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@
2222

2323

2424
class NeurekaMemoryLayout:
25-
_WEIGHT_BANDWIDTH = 256
25+
_WEIGHT_BANDWIDTH_1x1 = 256
26+
_WEIGHT_BANDWIDTH_3x3 = 288
2627
_CIN_SUBTILE_1x1 = 32
27-
_CIN_SUBTILE_3x3 = 28
28+
_CIN_SUBTILE_3x3 = 32
2829

2930
@staticmethod
3031
def weightEncode(
@@ -79,35 +80,19 @@ def weightEncode(
7980
# (-1, Weight Bandwidth)
8081
weight = np.pad(
8182
weight,
82-
((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])),
83+
((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH_3x3 - weight.shape[-1])),
8384
"constant",
8485
constant_values=0,
8586
)
87+
weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH_3x3 / 8))
8688
elif height == 1 and width == 1:
87-
# Tile cinSubtile into tiles of size 4
88-
# (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
89-
weight = weight.reshape(
90-
cout, cinMajor, bits, height * width, cinSubtile // 4, 4
91-
) # cout, cinMajor, bits, 1, 8, 4
92-
# Pad bits to 8
93-
if bits < 8:
94-
# (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
95-
weight = np.pad(
96-
weight,
97-
((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
98-
mode="constant",
99-
constant_values=0,
100-
)
101-
# (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
102-
weight = weight.transpose(0, 1, 3, 4, 2, 5)
103-
# (-1, Weight Bandwidth)
104-
weight = weight.reshape(
105-
cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH
106-
) # cout*cinMajor, 256b
89+
# (cout * cinMajor, Bits * cinSubtile)
90+
weight = weight.reshape(-1, bits * cinSubtile)
91+
# No padding needed here
92+
weightBandwidthBytes = int(np.ceil(bits * cinSubtile / 8))
10793

10894
# Prepare for packing
10995
# (-1, Weight Bandwidth Bytes, 8)
110-
weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8))
11196
weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2)
11297

11398
# Pack bits
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# Luka Macan <luka.macan@unibo.it>
2+
# Arpan Suravi Prasad <prasadar@iis.ee.ethz.ch>
3+
#
4+
# Copyright 2023 ETH Zurich and University of Bologna
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# SPDX-License-Identifier: Apache-2.0
19+
20+
import numpy as np
21+
import numpy.typing as npt
22+
23+
24+
class NeurekaMemoryLayoutSiracusa:
    """Weight memory layout encoder/decoder for N-EUREKA on Siracusa.

    The accelerator fetches weights through a fixed-width memory port of
    ``_WEIGHT_BANDWIDTH`` bits, so weights are bit-sliced and packed into
    bandwidth-sized rows. Input channels are processed in subtiles whose
    size depends on the kernel shape (28 for 3x3, 32 for 1x1).
    """

    # Width in bits of one weight-memory row.
    _WEIGHT_BANDWIDTH = 256
    # Input-channel subtile size for 1x1 kernels.
    _CIN_SUBTILE_1x1 = 32
    # Input-channel subtile size for 3x3 kernels.
    _CIN_SUBTILE_3x3 = 28

    @staticmethod
    def weightEncode(
        weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
    ) -> npt.NDArray[np.uint8]:
        """Unroll weight into the expected memory format.

        Expected weight shape is (cout, cin, H, W).
        The produced memory layout depends on the weight kernel shape:
        - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits),
        - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits),
        where cinMajor is ceil(cin / cin subtile <mode>) and cinMinor is
        padded with 0 up to cin subtile <mode>.

        Args:
            weight: uint8 tensor of shape (cout, cin, H, W); each value
                must fit in `bits` bits.
            bits: number of significant bits per weight (1..8).
            depthwise: if True, cout and cin axes are swapped before
                encoding (depthwise weights are stored channel-major).

        Returns:
            Flat uint8 array of packed weight-memory bytes.
        """
        if depthwise:
            weight = weight.transpose(1, 0, 2, 3)  # Swap cout and cin

        cout, cin, height, width = weight.shape
        cinSubtile = (
            NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_3x3
            if height == 3
            else NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_1x1
        )

        # Pad cin to be divisible by the subtile size.
        if cin % cinSubtile != 0:
            cinPad = cinSubtile - cin % cinSubtile
            weight = np.pad(
                weight,
                ((0, 0), (0, cinPad), (0, 0), (0, 0)),
                "constant",
                constant_values=0,
            )

        # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1)
        # The trailing 1 is required by the bit unpacking below.
        cinMajor = int(np.ceil(cin / cinSubtile))
        weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1)

        # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
        # (cout, cinMajor, cinSubtile, Flattened spatial, Bits)
        weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")

        # Shuffle bits so that the final shape is:
        # (cout, cinMajor, Bits, Flattened spatial, cinSubtile)
        weight = weight.transpose(0, 1, 4, 3, 2)

        # Pack dimensions to fit into the weight bandwidth.
        if height == 3 and width == 3:
            # (cout * cinMajor * Bits, H * W * cinSubtile)
            weight = weight.reshape(-1, height * width * cinSubtile)
            # Pad only the last dimension up to the weight bandwidth.
            # (-1, Weight Bandwidth)
            weight = np.pad(
                weight,
                (
                    (0, 0),
                    (0, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH - weight.shape[-1]),
                ),
                "constant",
                constant_values=0,
            )
        elif height == 1 and width == 1:
            # Tile cinSubtile into tiles of size 4
            # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
            weight = weight.reshape(
                cout, cinMajor, bits, height * width, cinSubtile // 4, 4
            )  # cout, cinMajor, bits, 1, 8, 4
            # Pad bits to 8
            if bits < 8:
                # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
                weight = np.pad(
                    weight,
                    ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
                    mode="constant",
                    constant_values=0,
                )
            # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
            weight = weight.transpose(0, 1, 3, 4, 2, 5)
            # (-1, Weight Bandwidth)
            weight = weight.reshape(
                cout * cinMajor, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH
            )  # cout*cinMajor, 256b

        # Prepare for packing
        # (-1, Weight Bandwidth Bytes, 8)
        weightBandwidthBytes = int(
            np.ceil(NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH / 8)
        )
        weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2)

        # Pack bits
        # (-1, Weight Bandwidth Bytes)
        weight = np.packbits(weight, axis=-1, bitorder="little")

        return weight.flatten()

    @staticmethod
    def weightDecode(
        weight: npt.NDArray[np.uint8],
        bits: int,
        cout: int,
        cin: int,
        height: int,
        width: int,
    ) -> npt.NDArray[np.uint8]:
        """Reverse of weightEncode.

        Args:
            weight: flat packed byte stream produced by `weightEncode`
                (non-depthwise layout).
            bits: number of significant bits per weight (1..8).
            cout, cin, height, width: original (unpadded) weight shape.

        Returns:
            uint8 tensor of shape (cout, cin, height, width).
        """
        cinSubtile = (
            NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_3x3
            if height == 3
            else NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_1x1
        )
        cinMajor = int(np.ceil(cin / cinSubtile))
        cinMinor = cinSubtile
        weightBandwidthBytes = int(
            np.ceil(NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH / 8)
        )

        # Unpack each bandwidth-sized row of bytes back into individual bits.
        weight = weight.reshape(-1, weightBandwidthBytes, 1)
        weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
        weight = weight.reshape(-1, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH)

        if height == 3 and width == 3:
            # Drop the bandwidth zero-padding, then undo the bit-plane shuffle.
            weight = weight[:, : height * width * cinMinor]
            weight = weight.reshape(
                cout, cinMajor, bits, height * width, cinMinor
            ).transpose(0, 1, 4, 3, 2)
        elif height == 1 and width == 1:
            # Undo the 4-channel tiling and the bits-padded-to-8 transpose.
            weight = weight[:, : height * width * cinMinor * 8]
            weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose(
                0, 1, 2, 4, 3
            )
        # Re-pack the per-weight bit planes into byte values.
        weight = np.packbits(weight, axis=-1, bitorder="little")
        weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
        # Strip the cin padding added during encoding.
        weight = weight[:, :cin, :, :]

        return weight

test/NeurekaTestConf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
6565
@field_validator("weight_type")
6666
@classmethod
6767
def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
68-
NeurekaTestConf._check_type("weight_type", v, ["int8"])
68+
NeurekaTestConf._check_type("weight_type", v, ["int8", "int7", "int6", "int5", "int4", "int3", "int2"])
6969
return v
7070

7171
@field_validator("scale_type")

0 commit comments

Comments (0)