Merge branch 'dev' into custom/transformer

auphelia · auphelia · commit 8d5295ce7e4d · 2025-12-19T14:17:01.000Z
diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -110,17 +110,17 @@ module mvu_vvu_8sx9_dsp58 #(
 	assign vld = L[0];
 
 //-------------------- Shift register for ZERO flag --------------------\\
-	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
-
-	if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+	// We need MAX_PIPELINE_STAGES-1 delay stages (INMODE is registered once more inside DSP)
+	uwire [MAX_PIPELINE_STAGES-1:0]  inmode_zero;
+	assign	inmode_zero[0] = zero;
+	if(MAX_PIPELINE_STAGES > 1) begin : genZReg
+		logic [MAX_PIPELINE_STAGES-1:1]  Z = '1;
 		always_ff @(posedge clk) begin
-			if (rst)      Z <= '{default: 0};
-			else if(en) begin
-				Z[0] <= zero;
-				if (MAX_PIPELINE_STAGES > 2)  Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
-			end
+			if(rst)      Z <= '1;
+			else if(en)  Z <= inmode_zero[MAX_PIPELINE_STAGES-2:0];
 		end
-	end;
+		assign	inmode_zero[MAX_PIPELINE_STAGES-1:1] = Z;
+	end : genZReg
 
 //-------------------- Buffer for input activations --------------------\\
 	localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
@@ -131,10 +131,10 @@ module mvu_vvu_8sx9_dsp58 #(
 			localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
 
 			if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
-				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+				(* EXTRACT_SHREG = "true" *)
+				logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 'x };
 				always_ff @(posedge clk) begin
-					if (rst)     A <= '{default: 0};
-					else if(en) begin
+					if(en) begin
 						A[EXTERNAL_PREGS-1] <=
 // synthesis translate_off
 							zero ? '1 :
@@ -177,10 +177,10 @@ module mvu_vvu_8sx9_dsp58 #(
 			localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
 
 			if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
-				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
+				(* EXTRACT_SHREG = "true" *)
+				logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 'x };
 				always_ff @(posedge clk) begin
-					if (rst)    B <= '{default: 0};
-					else if (en) begin
+					if(en) begin
 						B[i][EXTERNAL_PREGS-1] <=
 // synthesis translate_off
 							zero ? '1 :
@@ -253,7 +253,7 @@ module mvu_vvu_8sx9_dsp58 #(
 				logic InmodeZero = 0;
 				always_ff @(posedge clk) begin
 					if (rst)		InmodeZero <= 0;
-					else if (en)	InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+					else if (en)	InmodeZero <= inmode_zero[TOTAL_PREGS];
 				end
 				always_ff @(posedge clk) begin
 					if (rst)	Mreg <= 0;
@@ -401,7 +401,7 @@ module mvu_vvu_8sx9_dsp58 #(
 					.INMODE({
 							INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
 							2'b00,
-							TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+							inmode_zero[TOTAL_PREGS],
 							INTERNAL_PREGS==2 ? 1'b0 : 1'b1
 					}),                                 // 5-bit input: INMODE control
 					.NEGATE('0),                        // 3-bit input: Negates the input of the multiplier
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -37,6 +37,7 @@
 from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
     ConvolutionInputGenerator,
 )
+from finn.custom_op.fpgadataflow.crop import Crop
 from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams
 
 # Also import ElementwiseBinary variants
diff --git a/src/finn/custom_op/fpgadataflow/crop.py b/src/finn/custom_op/fpgadataflow/crop.py
@@ -0,0 +1,141 @@
+###################################################################################
+# Copyright (C) 2025, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Copyright for portions of this file is held by AMD and Microsoft under
+# MIT license as part of project Brainsmith.
+# All other copyright is held by AMD and is provided under BSD-3-Clause license.
+#
+###################################################################################
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class Crop(HWCustomOp):
+    """Abstraction layer for Crop layers."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "DataType": ("s", True, ""),
+            "ImgDim": ("ints", True, []),  # [h, w]
+            "NumChannels": ("i", True, 0),
+            "CropNorth": ("i", True, []),
+            "CropSouth": ("i", True, []),
+            "CropWest": ("i", True, []),
+            "CropEast": ("i", True, []),
+            "SIMD": ("i", False, 1),
+            "numInputVectors": ("ints", False, []),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        num_vec = self.get_nodeattr("numInputVectors")
+        h, w = self.get_nodeattr("ImgDim")
+        if h == 0:
+            img_dim = [w]
+        else:
+            img_dim = [h, w]
+        ch = self.get_nodeattr("NumChannels")
+        return num_vec + img_dim + [ch] if num_vec != [0] else img_dim + [ch]
+
+    def get_normal_output_shape(self, ind=0):
+        num_vec = self.get_nodeattr("numInputVectors")
+        height, width = self.get_nodeattr("ImgDim")
+        ch = self.get_nodeattr("NumChannels")
+        crop_north = self.get_nodeattr("CropNorth")
+        crop_east = self.get_nodeattr("CropEast")
+        crop_west = self.get_nodeattr("CropWest")
+        crop_south = self.get_nodeattr("CropSouth")
+        owidth = width - (crop_west + crop_east)
+        oheight = height - (crop_north + crop_south)
+        if oheight == 0:
+            o_img_dim = [owidth]
+        else:
+            o_img_dim = [oheight, owidth]
+        return num_vec + o_img_dim + [ch] if num_vec != [0] else o_img_dim + [ch]
+
+    def execute_node(self, context, graph):
+        node = self.onnx_node
+        h, w = self.get_nodeattr("ImgDim")
+        crop_north = self.get_nodeattr("CropNorth")
+        crop_east = self.get_nodeattr("CropEast")
+        crop_west = self.get_nodeattr("CropWest")
+        crop_south = self.get_nodeattr("CropSouth")
+        inp = context[node.input[0]]
+        if len(inp.shape) == 3:
+            cropped_slice = inp[crop_north : h - crop_south, crop_west : w - crop_east, :]
+        elif len(inp.shape) == 2:
+            cropped_slice = inp[crop_west : w - crop_east, :]
+        elif len(inp.shape) == 4:
+            cropped_slice = inp[:, crop_north : h - crop_south, crop_west : w - crop_east, :]
+        else:
+            raise Exception("Crop execute node currently only supports 2D - 4D input tensors.")
+        assert cropped_slice.shape == tuple(self.get_normal_output_shape())
+        context[node.output[0]] = cropped_slice
+
+    def get_input_datatype(self, ind=0):
+        return DataType[self.get_nodeattr("DataType")]
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        dt = model.get_tensor_datatype(node.input[0])
+        if dt != self.get_input_datatype():
+            warn_str = (
+                f"data_type changing for {node.name}: {str(self.get_input_datatype())} -> {str(dt)}"
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("DataType", dt.name)
+
+    def get_instream_width(self, ind=0):
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
+
+    def get_outstream_width(self, ind=0):
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
+
+    def get_output_datatype(self, ind=0):
+        return DataType[self.get_nodeattr("DataType")]
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        simd = self.get_nodeattr("SIMD")
+        assert normal_oshape[-1] % simd == 0, "Innermost dimension must be divisible by SIMD"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        simd = self.get_nodeattr("SIMD")
+        assert normal_ishape[-1] % simd == 0, "Innermost dimension must be divisible by SIMD"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_exp_cycles(self):
+        simd = self.get_nodeattr("SIMD")
+        num_vec = self.get_nodeattr("numInputVectors")
+        height, width = self.get_nodeattr("ImgDim")
+        ch = self.get_nodeattr("NumChannels")
+        if height == 0:
+            # pretend that height is 1 for code generation
+            height = 1
+
+        return (
+            np.prod(num_vec) * height * width * (ch // simd)
+            if num_vec != [0]
+            else height * width * (ch // simd)
+        )
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -35,6 +35,7 @@
 from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls
 from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls
 from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls
+from finn.custom_op.fpgadataflow.hls.crop_hls import Crop_hls
 from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls
 
 # Also import ElementwiseBinary variants
diff --git a/src/finn/custom_op/fpgadataflow/hls/crop_hls.py b/src/finn/custom_op/fpgadataflow/hls/crop_hls.py
@@ -0,0 +1,89 @@
+###################################################################################
+# Copyright (C) 2025, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Copyright for portions of this file is held by AMD and Microsoft under
+# MIT license as part of project Brainsmith.
+# All other copyright is held by AMD and is provided under BSD-3-Clause license.
+#
+###################################################################################
+
+from finn.custom_op.fpgadataflow.crop import Crop
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+
+
+class Crop_hls(Crop, HLSBackend):
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        return Crop.get_nodeattr_types(self) | HLSBackend.get_nodeattr_types(self)
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = [
+            '#include "crop.hpp"',
+        ]
+
+    def defines(self, var):
+        simd = self.get_nodeattr("SIMD")
+        dtype = self.get_input_datatype()
+        height, width = self.get_nodeattr("ImgDim")
+        if height == 0:
+            # pretend that height is 1 for code generation
+            height = 1
+        ch = self.get_nodeattr("NumChannels")
+        self.code_gen_dict["$DEFINES$"] = [
+            f"""
+            constexpr unsigned  SIMD      = {simd};
+            constexpr unsigned  H      = {height};
+            constexpr unsigned  W      = {width};
+            constexpr unsigned  CF     = {ch // simd};
+            constexpr unsigned  CROP_N = {self.get_nodeattr("CropNorth")};
+            constexpr unsigned  CROP_E = {self.get_nodeattr("CropEast")};
+            constexpr unsigned  CROP_S = {self.get_nodeattr("CropSouth")};
+            constexpr unsigned  CROP_W = {self.get_nodeattr("CropWest")};
+            using  TV = hls::vector<{dtype.get_hls_datatype_str()}, SIMD>;
+            """
+        ]
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            """
+            hls::stream<TV>  src0;
+            hls::stream<TV>  dst0;
+            #pragma HLS stream variable=src0 depth=2
+            #pragma HLS stream variable=dst0 depth=2
+
+            move(in0_V, src0);
+            crop< H, W,	CF, CROP_N, CROP_E, CROP_S, CROP_W, TV>(src0, dst0);
+            move(dst0, out0_V);
+            """
+        ]
+
+    def blackboxfunction(self):
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+            f"""
+            void {self.onnx_node.name} (
+                hls::stream<TV> &in0_V,
+                hls::stream<TV> &out0_V
+            )
+            """
+        ]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = [
+            """
+            #pragma HLS interface AXIS port=in0_V
+            #pragma HLS interface AXIS port=out0_V
+            #pragma HLS aggregate variable=in0_V compact=bit
+            #pragma HLS aggregate variable=out0_V compact=bit
+
+            #pragma HLS interface ap_ctrl_none port=return
+            #pragma HLS dataflow disable_start_propagation
+            """
+        ]
+
+    def execute_node(self, context, graph):
+        HLSBackend.execute_node(self, context, graph)
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
diff --git a/tests/fpgadataflow/test_fpgadataflow_crop.py b/tests/fpgadataflow/test_fpgadataflow_crop.py

Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,7 @@`
`37`	`37`	`from finn.custom_op.fpgadataflow.convolutioninputgenerator import (`
`38`	`38`	`ConvolutionInputGenerator,`
`39`	`39`	`)`
	`40`	`+from finn.custom_op.fpgadataflow.crop import Crop`
`40`	`41`	`from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams`
`41`	`42`
`42`	`43`	`# Also import ElementwiseBinary variants`