Skip to content

Commit 2a336cc

Browse files
committed
Merge remote-tracking branch 'origin/fc/astral-fix-3x3' into lg/scarv
2 parents cd0ec88 + c20c03c commit 2a336cc

File tree

9 files changed

+283
-50
lines changed

9 files changed

+283
-50
lines changed

neureka/hal/neureka_task.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,14 +166,16 @@ void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
166166
.d2 = h_out_stride};
167167
task->data.cfg.output_stride = output_stride;
168168

169-
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES;
170169
if (task->kernel_shape == 1) { // 1x1
170+
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1;
171171
task->data.cfg.weights_stride.d1 =
172-
NEUREKA_WEIGHT_BANDWIDTH_BYTES * num_k_in;
172+
(NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 / 8) * task->qw * num_k_in;
173173
} else if (!task->depthwise) { // 3x3
174+
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3;
174175
task->data.cfg.weights_stride.d1 =
175-
NEUREKA_WEIGHT_BANDWIDTH_BYTES * task->qw * num_k_in;
176+
NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 * task->qw * num_k_in;
176177
} else { // 3x3 depthwise
178+
task->data.cfg.weights_stride.d0 = NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3;
177179
task->data.cfg.weights_stride.d1 = 0;
178180
}
179181
task->data.cfg.weights_stride.d2 = 0;

neureka/hal/neureka_task_defs.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
#ifndef NNX_NEUREKA_PE_W
3030
#define NNX_NEUREKA_PE_W (4)
3131
#endif
32+
#define NNX_NEUREKA_BANDWIDTH_1x1 (256)
33+
#define NNX_NEUREKA_BANDWIDTH_3x3 (288)
3234

3335
#define NEUREKA_SUBTILE_INPUT_HEIGHT_1x1 (NNX_NEUREKA_PE_H)
3436
#define NEUREKA_SUBTILE_INPUT_WIDTH_1x1 (NNX_NEUREKA_PE_W)
@@ -38,12 +40,13 @@
3840
#define NEUREKA_SUBTILE_INPUT_WIDTH_3x3 (NNX_NEUREKA_PE_W+2)
3941
#define NEUREKA_SUBTILE_INPUT_CHANNEL_3x3 (32)
4042

41-
#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (4)
42-
#define NEUREKA_SUBTILE_OUTPUT_WIDTH (4)
43+
#define NEUREKA_SUBTILE_OUTPUT_HEIGHT (NNX_NEUREKA_PE_H)
44+
#define NEUREKA_SUBTILE_OUTPUT_WIDTH (NNX_NEUREKA_PE_W)
4345
#define NEUREKA_SUBTILE_OUTPUT_CHANNEL (32)
4446

4547
#define NEUREKA_OUTPUT_BANDWIDTH_BYTES (32)
46-
#define NEUREKA_WEIGHT_BANDWIDTH_BYTES (32)
48+
#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_1x1 (NNX_NEUREKA_BANDWIDTH_1x1/8)
49+
#define NEUREKA_WEIGHT_BANDWIDTH_BYTES_3x3 (NNX_NEUREKA_BANDWIDTH_3x3/8)
4750

4851
#define NEUREKA_ECC_REGS_NUM (4)
4952

test/HeaderWriter.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,10 @@ def render_vector(self, name, size, _type, init=None, elements_per_row=10):
9797
return retval
9898

9999
def check_declaration(self, name):
100-
return f"void check_{name}();\n\n"
100+
return f"int check_{name}();\n\n"
101101

102102
def check(self, name):
103-
return f"""void check_{name}() {{
103+
return f"""int check_{name}() {{
104104
printf("Checking the {name} vector:\\n");
105105
106106
int n_err = 0;
@@ -115,6 +115,7 @@ def check(self, name):
115115
printf("> Success! No errors found.\\n");
116116
else
117117
printf("> Failure! Found %d/%d errors.\\n", n_err, {name.upper()}_SIZE);
118+
return n_err;
118119
}}
119120
120121
"""

test/NeuralEngineFunctionalModel.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,23 @@
22

33
import torch
44
import torch.nn.functional as F
5+
import numpy as np
56

67
from TestClasses import IntegerType, Padding, Stride
78

89

910
class NeuralEngineFunctionalModel:
1011
ACCUMULATOR_TYPE = IntegerType(name="int32")
1112

13+
@staticmethod
14+
def _tensor_to_hex(tensor):
15+
int_tensor = np.asarray(torch.floor(tensor).to(torch.int64))
16+
int_tensor[int_tensor < 0] = 0xffffffff + (int_tensor[int_tensor < 0]+1)
17+
hex_tensor = np.empty(int_tensor.shape, dtype=object)
18+
for idx in np.ndindex(int_tensor.shape):
19+
hex_tensor[idx] = hex(int_tensor[idx].item())
20+
return hex_tensor
21+
1222
@staticmethod
1323
def _cast(
1424
tensor: torch.Tensor, _type: IntegerType, saturate: bool = False
@@ -36,7 +46,10 @@ def _norm_quant(
3646

3747
if verbose:
3848
print("INTERMEDIATE RESULTS (after scale):")
39-
print(tensor)
49+
current_threshold = np.get_printoptions()['threshold']
50+
np.set_printoptions(threshold=np.inf)
51+
print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
52+
np.set_printoptions(threshold=current_threshold)
4053

4154
if has_bias:
4255
assert bias is not None
@@ -54,13 +67,23 @@ def _norm_quant(
5467

5568
if verbose:
5669
print("INTERMEDIATE RESULTS (after bias):")
57-
print(tensor)
70+
current_threshold = np.get_printoptions()['threshold']
71+
np.set_printoptions(threshold=np.inf)
72+
print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
73+
np.set_printoptions(threshold=current_threshold)
5874

5975
if has_relu:
6076
tensor = F.relu(tensor)
6177

6278
tensor = tensor >> global_shift
6379

80+
if verbose:
81+
print("INTERMEDIATE RESULTS (after shift):")
82+
current_threshold = np.get_printoptions()['threshold']
83+
np.set_printoptions(threshold=np.inf)
84+
print(NeuralEngineFunctionalModel._tensor_to_hex(tensor))
85+
np.set_printoptions(threshold=current_threshold)
86+
6487
# Saturate into out_type
6588
tensor = NeuralEngineFunctionalModel._cast(tensor, out_type, saturate=True)
6689

@@ -98,6 +121,15 @@ def convolution(
98121
0,
99122
)
100123

124+
if verbose:
125+
print("INPUTS (padded):")
126+
current_threshold = np.get_printoptions()['threshold']
127+
np.set_printoptions(threshold=np.inf)
128+
print(NeuralEngineFunctionalModel._tensor_to_hex(input_padded))
129+
print("WEIGHTS (padded):")
130+
print(NeuralEngineFunctionalModel._tensor_to_hex(weight))
131+
np.set_printoptions(threshold=current_threshold)
132+
101133
# Accumulators are 32bit non-saturating.
102134
# Calculate in higher precision (int64)
103135
output = F.conv2d(
@@ -114,7 +146,10 @@ def convolution(
114146

115147
if verbose:
116148
print("INTERMEDIATE RESULTS (pre-normalization/requant):")
117-
print(output)
149+
current_threshold = np.get_printoptions()['threshold']
150+
np.set_printoptions(threshold=np.inf)
151+
print(NeuralEngineFunctionalModel._tensor_to_hex(output))
152+
np.set_printoptions(threshold=current_threshold)
118153

119154
if has_norm_quant:
120155
assert scale is not None

test/NeurekaMemoryLayout.py

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@
2222

2323

2424
class NeurekaMemoryLayout:
25-
_WEIGHT_BANDWIDTH = 256
25+
_WEIGHT_BANDWIDTH_1x1 = 256
26+
_WEIGHT_BANDWIDTH_3x3 = 288
2627
_CIN_SUBTILE_1x1 = 32
27-
_CIN_SUBTILE_3x3 = 28
28+
_CIN_SUBTILE_3x3 = 32
2829

2930
@staticmethod
3031
def weightEncode(
@@ -79,35 +80,19 @@ def weightEncode(
7980
# (-1, Weight Bandwidth)
8081
weight = np.pad(
8182
weight,
82-
((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH - weight.shape[-1])),
83+
((0, 0), (0, NeurekaMemoryLayout._WEIGHT_BANDWIDTH_3x3 - weight.shape[-1])),
8384
"constant",
8485
constant_values=0,
8586
)
87+
weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH_3x3 / 8))
8688
elif height == 1 and width == 1:
87-
# Tile cinSubtile into tiles of size 4
88-
# (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
89-
weight = weight.reshape(
90-
cout, cinMajor, bits, height * width, cinSubtile // 4, 4
91-
) # cout, cinMajor, bits, 1, 8, 4
92-
# Pad bits to 8
93-
if bits < 8:
94-
# (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
95-
weight = np.pad(
96-
weight,
97-
((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
98-
mode="constant",
99-
constant_values=0,
100-
)
101-
# (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
102-
weight = weight.transpose(0, 1, 3, 4, 2, 5)
103-
# (-1, Weight Bandwidth)
104-
weight = weight.reshape(
105-
cout * cinMajor, NeurekaMemoryLayout._WEIGHT_BANDWIDTH
106-
) # cout*cinMajor, 256b
89+
# (cout * cinMajor, Bits * cinSubtile)
90+
weight = weight.reshape(-1, bits * cinSubtile)
91+
# No padding needed here
92+
weightBandwidthBytes = int(np.ceil(bits * cinSubtile / 8))
10793

10894
# Prepare for packing
10995
# (-1, Weight Bandwidth Bytes, 8)
110-
weightBandwidthBytes = int(np.ceil(NeurekaMemoryLayout._WEIGHT_BANDWIDTH / 8))
11196
weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2)
11297

11398
# Pack bits
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# Luka Macan <luka.macan@unibo.it>
2+
# Arpan Suravi Prasad <prasadar@iis.ee.ethz.ch>
3+
#
4+
# Copyright 2023 ETH Zurich and University of Bologna
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# SPDX-License-Identifier: Apache-2.0
19+
20+
import numpy as np
21+
import numpy.typing as npt
22+
23+
24+
class NeurekaMemoryLayoutSiracusa:
    """Weight memory layout encoder/decoder for N-EUREKA on Siracusa.

    The accelerator fetches weights through a fixed-width memory port of
    ``_WEIGHT_BANDWIDTH`` bits, so weights are bit-sliced and packed into
    bandwidth-sized rows. Input channels are processed in subtiles whose
    size depends on the kernel shape (28 for 3x3, 32 for 1x1).
    """

    # Width in bits of one weight-memory row.
    _WEIGHT_BANDWIDTH = 256
    # Input-channel subtile size for 1x1 kernels.
    _CIN_SUBTILE_1x1 = 32
    # Input-channel subtile size for 3x3 kernels.
    _CIN_SUBTILE_3x3 = 28

    @staticmethod
    def weightEncode(
        weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = False
    ) -> npt.NDArray[np.uint8]:
        """Unroll weight into the expected memory format.

        Expected weight shape is (cout, cin, H, W).
        The produced memory layout depends on the weight kernel shape:
        - 3x3: (cout, cinMajor, Bits, H x W x cinMinor_3x3 packed into Weight Bandwidth bits),
        - 1x1: (cout, cinMajor, Bits x H x W x cinMinor_1x1 packed into Weight Bandwidth bits),
        where cinMajor is ceil(cin / cin subtile <mode>) and cinMinor is
        padded with 0 up to cin subtile <mode>.

        Args:
            weight: uint8 tensor of shape (cout, cin, H, W); each value
                must fit in `bits` bits.
            bits: number of significant bits per weight (1..8).
            depthwise: if True, cout and cin axes are swapped before
                encoding (depthwise weights are stored channel-major).

        Returns:
            Flat uint8 array of packed weight-memory bytes.
        """
        if depthwise:
            weight = weight.transpose(1, 0, 2, 3)  # Swap cout and cin

        cout, cin, height, width = weight.shape
        cinSubtile = (
            NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_3x3
            if height == 3
            else NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_1x1
        )

        # Pad cin to be divisible by the subtile size.
        if cin % cinSubtile != 0:
            cinPad = cinSubtile - cin % cinSubtile
            weight = np.pad(
                weight,
                ((0, 0), (0, cinPad), (0, 0), (0, 0)),
                "constant",
                constant_values=0,
            )

        # Reshape into (cout, cinMajor, cinMinor, Flattened spatial, 1)
        # The trailing 1 is required by the bit unpacking below.
        cinMajor = int(np.ceil(cin / cinSubtile))
        weight = weight.reshape(cout, cinMajor, cinSubtile, height * width, 1)

        # Unpack 'bits' bits in little order, e.g. bits=4: 3 => [1, 1, 0, 0]
        # (cout, cinMajor, cinSubtile, Flattened spatial, Bits)
        weight = np.unpackbits(weight, axis=-1, count=bits, bitorder="little")

        # Shuffle bits so that the final shape is:
        # (cout, cinMajor, Bits, Flattened spatial, cinSubtile)
        weight = weight.transpose(0, 1, 4, 3, 2)

        # Pack dimensions to fit into the weight bandwidth.
        if height == 3 and width == 3:
            # (cout * cinMajor * Bits, H * W * cinSubtile)
            weight = weight.reshape(-1, height * width * cinSubtile)
            # Pad only the last dimension up to the weight bandwidth.
            # (-1, Weight Bandwidth)
            weight = np.pad(
                weight,
                (
                    (0, 0),
                    (0, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH - weight.shape[-1]),
                ),
                "constant",
                constant_values=0,
            )
        elif height == 1 and width == 1:
            # Tile cinSubtile into tiles of size 4
            # (cout, cinMajor, Bits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
            weight = weight.reshape(
                cout, cinMajor, bits, height * width, cinSubtile // 4, 4
            )  # cout, cinMajor, bits, 1, 8, 4
            # Pad bits to 8
            if bits < 8:
                # (cout, cinMajor, PaddedBits, Flattened spatial, cinSubtileMajor, cinSubtileTile)
                weight = np.pad(
                    weight,
                    ((0, 0), (0, 0), (0, 8 - bits), (0, 0), (0, 0), (0, 0)),
                    mode="constant",
                    constant_values=0,
                )
            # (cout, cinMajor, Flattened spatial, cinSubtileMajor, PaddedBits, cinSubtileTile)
            weight = weight.transpose(0, 1, 3, 4, 2, 5)
            # (-1, Weight Bandwidth)
            weight = weight.reshape(
                cout * cinMajor, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH
            )  # cout*cinMajor, 256b

        # Prepare for packing
        # (-1, Weight Bandwidth Bytes, 8)
        weightBandwidthBytes = int(
            np.ceil(NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH / 8)
        )
        weight = np.stack(np.split(weight, weightBandwidthBytes, axis=-1), axis=-2)

        # Pack bits
        # (-1, Weight Bandwidth Bytes)
        weight = np.packbits(weight, axis=-1, bitorder="little")

        return weight.flatten()

    @staticmethod
    def weightDecode(
        weight: npt.NDArray[np.uint8],
        bits: int,
        cout: int,
        cin: int,
        height: int,
        width: int,
    ) -> npt.NDArray[np.uint8]:
        """Reverse of weightEncode.

        Args:
            weight: flat packed byte stream produced by `weightEncode`
                (non-depthwise layout).
            bits: number of significant bits per weight (1..8).
            cout, cin, height, width: original (unpadded) weight shape.

        Returns:
            uint8 tensor of shape (cout, cin, height, width).
        """
        cinSubtile = (
            NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_3x3
            if height == 3
            else NeurekaMemoryLayoutSiracusa._CIN_SUBTILE_1x1
        )
        cinMajor = int(np.ceil(cin / cinSubtile))
        cinMinor = cinSubtile
        weightBandwidthBytes = int(
            np.ceil(NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH / 8)
        )

        # Unpack each bandwidth-sized row of bytes back into individual bits.
        weight = weight.reshape(-1, weightBandwidthBytes, 1)
        weight = np.unpackbits(weight, axis=-1, count=8, bitorder="little")
        weight = weight.reshape(-1, NeurekaMemoryLayoutSiracusa._WEIGHT_BANDWIDTH)

        if height == 3 and width == 3:
            # Drop the bandwidth zero-padding, then undo the bit-plane shuffle.
            weight = weight[:, : height * width * cinMinor]
            weight = weight.reshape(
                cout, cinMajor, bits, height * width, cinMinor
            ).transpose(0, 1, 4, 3, 2)
        elif height == 1 and width == 1:
            # Undo the 4-channel tiling and the bits-padded-to-8 transpose.
            weight = weight[:, : height * width * cinMinor * 8]
            weight = weight.reshape(cout, cinMajor, cinMinor // 4, 8, 4).transpose(
                0, 1, 2, 4, 3
            )
        # Re-pack the per-weight bit planes into byte values.
        weight = np.packbits(weight, axis=-1, bitorder="little")
        weight = weight.reshape(cout, cinMajor * cinMinor, height, width)
        # Strip the cin padding added during encoding.
        weight = weight[:, :cin, :, :]

        return weight

test/NeurekaTestConf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def check_valid_out_type(cls, v: IntegerType) -> IntegerType:
6565
@field_validator("weight_type")
6666
@classmethod
6767
def check_valid_weight_type(cls, v: IntegerType) -> IntegerType:
68-
NeurekaTestConf._check_type("weight_type", v, ["int8"])
68+
NeurekaTestConf._check_type("weight_type", v, ["int8", "int7", "int6", "int5", "int4", "int3", "int2"])
6969
return v
7070

7171
@field_validator("scale_type")

0 commit comments

Comments (0)