diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index 84e9633304..26a2073e4a 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -39,6 +39,14 @@ RTLBackend :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.addclstoken +----------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.addclstoken + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.addstreams ---------------------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst index 346eddb073..859a789f2f 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst @@ -5,6 +5,14 @@ Custom Op - fpgadataflow.rtl RTL Custom Op Nodes =================== +finn.custom\_op.fpgadataflow.rtl.addclstoken\_rtl +-------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.addclstoken_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl ------------------------------------------------------------ diff --git a/finn-rtllib/addclstoken/hdl/addclstoken.sv b/finn-rtllib/addclstoken/hdl/addclstoken.sv new file mode 100644 index 0000000000..d5bbdc2188 --- /dev/null +++ b/finn-rtllib/addclstoken/hdl/addclstoken.sv @@ -0,0 +1,139 @@ +/**************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief Insert a constant class token into a folded token stream. 
+ * @author Oliver Cassidy + * + * @description + * Prepends a learned class token, supplied through cls_data, to each + * input sequence of patch tokens. The class token and patch tokens are + * transferred as SIMD-wide folds of ELEM_WIDTH-bit elements. + * + * Per sequence, the output stream is: + * 1. NUM_CHANNELS/SIMD folds from cls_data + * 2. NUM_TOKENS pass-through input tokens + * 3. PAD_TOKENS zero-valued tokens, when padding is enabled + ***************************************************************************/ + +module addclstoken #( + parameter int unsigned NUM_TOKENS = 196, + parameter int unsigned NUM_CHANNELS = 192, + parameter int unsigned SIMD = 1, + parameter int unsigned ELEM_WIDTH = 8, + parameter int unsigned PAD_TOKENS = 0 +)( + input logic clk, + input logic rst, + + output logic irdy, + input logic ivld, + input logic [SIMD*ELEM_WIDTH-1:0] idat, + + input logic ordy, + output logic ovld, + output logic [SIMD*ELEM_WIDTH-1:0] odat, + + input logic [NUM_CHANNELS*ELEM_WIDTH-1:0] cls_data +); + + localparam int unsigned FOLD_WIDTH = SIMD * ELEM_WIDTH; + localparam int unsigned FOLDS_PER_TOKEN = NUM_CHANNELS / SIMD; + localparam int unsigned TOTAL_INPUT_FOLDS = NUM_TOKENS * FOLDS_PER_TOKEN; + localparam int unsigned TOTAL_PAD_FOLDS = PAD_TOKENS * FOLDS_PER_TOKEN; + localparam int unsigned MAX_PHASE_FOLDS = + (TOTAL_INPUT_FOLDS > FOLDS_PER_TOKEN) ? + ((TOTAL_INPUT_FOLDS > TOTAL_PAD_FOLDS) ? + TOTAL_INPUT_FOLDS : TOTAL_PAD_FOLDS) : + ((FOLDS_PER_TOKEN > TOTAL_PAD_FOLDS) ? + FOLDS_PER_TOKEN : TOTAL_PAD_FOLDS); + localparam int unsigned CNT_WIDTH = (MAX_PHASE_FOLDS <= 1) ? 1 : $clog2(MAX_PHASE_FOLDS); + + typedef enum logic [1:0] { + EMIT_CLS, + PASSTHROUGH, + EMIT_PAD + } state_t; + + state_t state; + state_t next_state; + logic [CNT_WIDTH-1:0] fold_cnt; + logic fold_cnt_last; + logic out_transfer; + + logic [CNT_WIDTH-1:0] cls_fold_cnt; + logic [FOLD_WIDTH-1:0] cls_fold; + + assign cls_fold_cnt = (int'(fold_cnt) < FOLDS_PER_TOKEN) ? 
fold_cnt : '0; + assign cls_fold = cls_data[cls_fold_cnt * FOLD_WIDTH +: FOLD_WIDTH]; + assign out_transfer = ovld & ordy; + + always_comb begin + unique case (state) + EMIT_CLS: fold_cnt_last = (int'(fold_cnt) == FOLDS_PER_TOKEN - 1); + PASSTHROUGH: fold_cnt_last = (int'(fold_cnt) == TOTAL_INPUT_FOLDS - 1); + EMIT_PAD: fold_cnt_last = (int'(fold_cnt) == TOTAL_PAD_FOLDS - 1); + default: fold_cnt_last = 1'b1; + endcase + end + + always_comb begin + irdy = 1'b0; + ovld = 1'b0; + odat = '0; + + unique case (state) + EMIT_CLS: begin + ovld = 1'b1; + odat = cls_fold; + end + PASSTHROUGH: begin + irdy = ordy; + ovld = ivld; + odat = idat; + end + EMIT_PAD: begin + ovld = 1'b1; + end + default: begin + end + endcase + end + + always_comb begin + next_state = state; + if (out_transfer && fold_cnt_last) begin + unique case (state) + EMIT_CLS: begin + next_state = PASSTHROUGH; + end + PASSTHROUGH: begin + next_state = (PAD_TOKENS == 0) ? EMIT_CLS : EMIT_PAD; + end + EMIT_PAD: begin + next_state = EMIT_CLS; + end + default: begin + next_state = EMIT_CLS; + end + endcase + end + end + + always_ff @(posedge clk) begin + if (rst) begin + state <= EMIT_CLS; + fold_cnt <= '0; + end else if (out_transfer) begin + if (fold_cnt_last) begin + state <= next_state; + fold_cnt <= '0; + end else begin + fold_cnt <= fold_cnt + 1'b1; + end + end + end + +endmodule diff --git a/finn-rtllib/addclstoken/hdl/addclstoken_template.v b/finn-rtllib/addclstoken/hdl/addclstoken_template.v new file mode 100644 index 0000000000..d38dd72bed --- /dev/null +++ b/finn-rtllib/addclstoken/hdl/addclstoken_template.v @@ -0,0 +1,81 @@ +/****************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/ + +module $TOP_MODULE_NAME$ #( + parameter FOLD_WIDTH = $FOLD_WIDTH$, + parameter AXI_WIDTH = ((FOLD_WIDTH + 7) / 8) * 8 +)( + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out0_V, ASSOCIATED_RESET ap_rst_n" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + output in0_V_TREADY, + input in0_V_TVALID, + input [AXI_WIDTH-1:0] in0_V_TDATA, + + input out0_V_TREADY, + output out0_V_TVALID, + output [AXI_WIDTH-1:0] out0_V_TDATA +); + + localparam [$CLS_WIDTH$-1:0] CLS_DATA = $CLS_DATA$; + + wire [FOLD_WIDTH-1:0] core_out; + + assign out0_V_TDATA[FOLD_WIDTH-1:0] = core_out; + + generate + if (AXI_WIDTH > FOLD_WIDTH) begin : gen_pad_tdata + assign out0_V_TDATA[AXI_WIDTH-1:FOLD_WIDTH] = {(AXI_WIDTH-FOLD_WIDTH){1'b0}}; + end + endgenerate + + addclstoken #( + .NUM_TOKENS($NUM_TOKENS$), + .NUM_CHANNELS($NUM_CHANNELS$), + .SIMD($SIMD$), + .ELEM_WIDTH($ELEM_WIDTH$), + .PAD_TOKENS($PAD_TOKENS$) + ) impl ( + .clk(ap_clk), + .rst(!ap_rst_n), + .irdy(in0_V_TREADY), + .ivld(in0_V_TVALID), + .idat(in0_V_TDATA[FOLD_WIDTH-1:0]), + .ordy(out0_V_TREADY), + .ovld(out0_V_TVALID), + .odat(core_out), + .cls_data(CLS_DATA) + ); + +endmodule diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 7c5d27dfb9..ca15a01c07 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -500,6 +500,9 @@ def apply_if_relevant(model, op_types, transform, desc=""): ) # Streaming operations + model = apply_if_relevant( + model, ["Concat"], to_hw.InferAddCLSTokenLayer(), "CLS token insertion" + ) model = apply_if_relevant(model, ["Concat"], to_hw.InferConcatLayer(), "concat layers") model = apply_if_relevant(model, ["Split"], to_hw.InferSplitLayer(), "split layers") diff --git a/src/finn/custom_op/fpgadataflow/__init__.py 
b/src/finn/custom_op/fpgadataflow/__init__.py index f05198837b..c00a1d5054 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -52,6 +52,7 @@ def register_custom_op(cls): # Import the submodule containing specializations of ElementwiseBinaryOperation # Note: This will automatically register all decorated classes into this domain import finn.custom_op.fpgadataflow.elementwise_binary +from finn.custom_op.fpgadataflow.addclstoken import AddCLSToken from finn.custom_op.fpgadataflow.concat import StreamingConcat from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, @@ -91,6 +92,7 @@ def register_custom_op(cls): custom_op["VVAU"] = VVAU custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition +custom_op["AddCLSToken"] = AddCLSToken custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["Crop"] = Crop custom_op["DuplicateStreams"] = DuplicateStreams diff --git a/src/finn/custom_op/fpgadataflow/addclstoken.py b/src/finn/custom_op/fpgadataflow/addclstoken.py new file mode 100644 index 0000000000..35eae4bb29 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/addclstoken.py @@ -0,0 +1,171 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class AddCLSToken(HWCustomOp): + """Prepend a learned class token to a sequence of patch tokens.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumTokens": ("i", True, 0), + "NumChannels": ("i", True, 0), + "PadTokens": ("i", False, 0), + "SIMD": ("i", False, 1), + "inputDataType": ("s", True, ""), + "outputDataType": ("s", False, ""), + } + ) + return my_attrs + + def get_normal_input_shape(self, ind=0): + num_channels = self.get_nodeattr("NumChannels") + if ind == 0: + return (1, self.get_nodeattr("NumTokens"), num_channels) + elif ind == 1: + return (1, 1, num_channels) + else: + raise Exception("AddCLSToken only has two inputs") + + def get_folded_input_shape(self, ind=0): + normal_shape = self.get_normal_input_shape(ind) + simd = self.get_nodeattr("SIMD") + num_channels = normal_shape[-1] + assert num_channels % simd == 0, "SIMD must divide NumChannels" + return normal_shape[:-1] + (num_channels // 
simd, simd) + + def get_normal_output_shape(self, ind=0): + num_tokens = self.get_nodeattr("NumTokens") + num_channels = self.get_nodeattr("NumChannels") + pad_tokens = self.get_nodeattr("PadTokens") + return (1, num_tokens + 1 + pad_tokens, num_channels) + + def get_folded_output_shape(self, ind=0): + normal_shape = self.get_normal_output_shape(ind) + simd = self.get_nodeattr("SIMD") + num_channels = normal_shape[-1] + assert num_channels % simd == 0, "SIMD must divide NumChannels" + return normal_shape[:-1] + (num_channels // simd, simd) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape(0) + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for patch tokens." + + exp_wshape = self.get_normal_input_shape(1) + wshape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert wshape == exp_wshape, "Unexpected input shape for CLS token." + + return super().make_const_shape_op(self.get_normal_output_shape()) + + def infer_node_datatype(self, model): + node = self.onnx_node + attr_idt = None + if self.get_nodeattr("inputDataType") != "": + attr_idt = self.get_input_datatype() + + idt = model.get_tensor_datatype(node.input[0]) + if idt is None: + idt = attr_idt + if idt is None: + raise Exception("AddCLSToken input datatype is not set") + + if attr_idt is not None and attr_idt != idt: + warnings.warn( + "inputDataType changing for %s: %s -> %s" % (node.name, str(attr_idt), str(idt)) + ) + self.set_nodeattr("inputDataType", idt.name) + + cls_dt = model.get_tensor_datatype(node.input[1]) + if cls_dt is None: + model.set_tensor_datatype(node.input[1], idt) + else: + assert cls_dt == idt, "CLS token datatype must match input datatype." 
+ + self.set_nodeattr("outputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + odt = self.get_nodeattr("outputDataType") + if odt == "": + return self.get_input_datatype(ind) + return DataType[odt] + + def get_instream_width(self, ind=0): + if ind != 0: + return 0 + return self.get_input_datatype().bitwidth() * self.get_nodeattr("SIMD") + + def get_outstream_width(self, ind=0): + return self.get_output_datatype().bitwidth() * self.get_nodeattr("SIMD") + + def get_number_output_values(self): + return int(np.prod(self.get_folded_output_shape()[:-1])) + + def get_exp_cycles(self): + return int(np.prod(self.get_folded_output_shape()[:-1])) + + def execute_node(self, context, graph): + node = self.onnx_node + patches = context[node.input[0]] + cls_token = context[node.input[1]] + + result = np.concatenate([cls_token, patches], axis=1) + pad_tokens = self.get_nodeattr("PadTokens") + if pad_tokens > 0: + pad_shape = (1, pad_tokens, self.get_nodeattr("NumChannels")) + padding = np.zeros(pad_shape, dtype=result.dtype) + result = np.concatenate([result, padding], axis=1) + + oshape = self.get_normal_output_shape() + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + + def bram_estimation(self): + return 0 + + def lut_estimation(self): + return int(128 + self.get_nodeattr("NumChannels")) + + def get_op_and_param_counts(self): + return {"param_cls_token": int(self.get_nodeattr("NumChannels"))} diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 520fcdcd12..fd3df3fbb7 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from finn.custom_op.fpgadataflow.rtl.addclstoken_rtl import AddCLSToken_rtl from finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl import ( ConvolutionInputGenerator_rtl, ) @@ -51,6 +52,7 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure +custom_op["AddCLSToken_rtl"] = AddCLSToken_rtl custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["ElementwiseAdd_rtl"] = ElementwiseAdd_rtl custom_op["ElementwiseSub_rtl"] = ElementwiseSub_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py new file mode 100644 index 0000000000..8ca3daec88 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py @@ -0,0 +1,168 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import shutil +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.addclstoken import AddCLSToken +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend + + +def _rtlsrc_dir(): + return os.environ["FINN_ROOT"] + "/finn-rtllib/addclstoken/hdl" + + +class AddCLSToken_rtl(AddCLSToken, RTLBackend): + """RTL implementation of AddCLSToken.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(AddCLSToken.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def _pack_value(self, value, dtype): + bitwidth = dtype.bitwidth() + if dtype == DataType["BIPOLAR"]: + int_value = int((value + 1) // 2) + else: + if dtype.is_fixed_point(): + value = value / dtype.scale_factor() + int_value = int(value) + if int_value < 0: + int_value += 1 << bitwidth + return int_value & ((1 << bitwidth) - 1) + + def _pack_cls_token(self, model): + dtype = self.get_input_datatype() + bitwidth = dtype.bitwidth() + num_channels = self.get_nodeattr("NumChannels") + cls_token = model.get_initializer(self.onnx_node.input[1]) + if cls_token is None: + raise Exception("AddCLSToken RTL generation requires a constant CLS token input.") + + cls_token = np.asarray(cls_token, dtype=np.float32) + assert cls_token.shape == self.get_normal_input_shape( + 1 + ), "CLS token 
shape does not match AddCLSToken attributes." + assert np.vectorize(dtype.allowed)(cls_token).all(), ( + "CLS token values cannot be represented with %s" % dtype.name + ) + packed = 0 + for i, value in enumerate(cls_token.flatten()): + packed |= self._pack_value(value, dtype) << (i * bitwidth) + return "%d'h%x" % (num_channels * bitwidth, packed) + + def generate_hdl(self, model, fpgapart, clk): + simd = self.get_nodeattr("SIMD") + num_channels = self.get_nodeattr("NumChannels") + assert num_channels % simd == 0, "SIMD must divide NumChannels" + + rtlsrc = _rtlsrc_dir() + template_path = rtlsrc + "/addclstoken_template.v" + with open(template_path, "r") as f: + template = f.read() + + topname = self.get_verilog_top_module_name() + self.set_nodeattr("gen_top_module", topname) + + elem_width = self.get_input_datatype().bitwidth() + fold_width = elem_width * simd + code_gen_dict = { + "TOP_MODULE_NAME": topname, + "NUM_TOKENS": self.get_nodeattr("NumTokens"), + "NUM_CHANNELS": num_channels, + "SIMD": simd, + "ELEM_WIDTH": elem_width, + "PAD_TOKENS": self.get_nodeattr("PadTokens"), + "FOLD_WIDTH": fold_width, + "CLS_WIDTH": num_channels * elem_width, + "CLS_DATA": self._pack_cls_token(model), + } + + for key, value in code_gen_dict.items(): + template = template.replace("$%s$" % key, str(value)) + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(os.path.join(code_gen_dir, topname + ".v"), "w") as f: + f.write(template) + shutil.copy(rtlsrc + "/addclstoken.sv", code_gen_dir) + + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def get_rtl_file_list(self, abspath=False): + if abspath: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + rtllib_dir = _rtlsrc_dir() + "/" + else: + code_gen_dir = "" + rtllib_dir = "" + + verilog_files = [ + rtllib_dir + "addclstoken.sv", + code_gen_dir + self.get_nodeattr("gen_top_module") + ".v", + ] + return verilog_files + + def get_rtlsim_input_indices(self): 
+ """Only patch tokens are streamed; CLS token data is embedded in generated RTL.""" + return [0] + + def code_generation_ipi(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + sourcefiles = self.get_rtl_file_list() + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % f] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + if mode == "cppsim": + AddCLSToken.execute_node(self, context, graph) + elif mode == "rtlsim": + RTLBackend.execute_node(self, context, graph) + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following values ("cppsim", "rtlsim")""".format( + mode + ) + ) diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index 8635a96550..2b8db0310e 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -85,6 +85,10 @@ def code_generation_ipi(self): def code_generation_ipgen(self, model, fpgapart, clk): self.generate_hdl(model, fpgapart, clk) + def get_rtlsim_input_indices(self): + """Return ONNX input indices that are driven as RTLSim input streams.""" + return range(len(self.onnx_node.input)) + def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -92,7 +96,10 @@ def execute_node(self, context, graph): if mode == "rtlsim": node = self.onnx_node inputs = {} - for i, inp in enumerate(node.input): + for i in self.get_rtlsim_input_indices(): + inp = node.input[i] + nbits = self.get_instream_width(i) + assert nbits > 0, "RTLSim input stream %d has zero width." 
% i exp_ishape = tuple(self.get_normal_input_shape(i)) folded_ishape = self.get_folded_input_shape(i) inp_val = context[inp] @@ -102,7 +109,6 @@ def execute_node(self, context, graph): reshaped_input = inp_val.reshape(folded_ishape) np.save(os.path.join(code_gen_dir, "input_%s.npy" % i), reshaped_input) - nbits = self.get_instream_width(i) rtlsim_inp = npy_to_rtlsim_input( "{}/input_{}.npy".format(code_gen_dir, i), export_idt, nbits ) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index f7b7beee14..994905c9c6 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1334,6 +1334,85 @@ def apply(self, model): return (model, graph_modified) +class InferAddCLSTokenLayer(Transformation): + """Convert Concat([cls_token, patches], axis=1) into AddCLSToken.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type != "Concat": + continue + + axis = get_by_name(node.attribute, "axis") + if axis is None or len(node.input) != 2: + continue + + cls_name = node.input[0] + patch_name = node.input[1] + cls_init = model.get_initializer(cls_name) + if cls_init is None or model.get_initializer(patch_name) is not None: + continue + + cls_shape = model.get_tensor_shape(cls_name) + if cls_shape is None: + cls_shape = list(cls_init.shape) + patch_shape = model.get_tensor_shape(patch_name) + if cls_shape is None or patch_shape is None: + continue + if any(x is None for x in list(cls_shape) + list(patch_shape)): + continue + + rank = len(patch_shape) + concat_axis = axis.i if axis.i >= 0 else axis.i + rank + if rank != 3 or concat_axis != 1: + continue + + if len(cls_shape) != 3 or cls_shape[0] != 1 or cls_shape[1] != 1: + continue + if patch_shape[0] != 1 or cls_shape[2] != patch_shape[2]: + continue + + out_shape = 
model.get_tensor_shape(node.output[0]) + exp_oshape = [1, patch_shape[1] + 1, patch_shape[2]] + if out_shape is not None and list(out_shape) != exp_oshape: + continue + + idt = model.get_tensor_datatype(patch_name) + if idt is None or not idt.is_integer(): + continue + cls_dt = model.get_tensor_datatype(cls_name) + if cls_dt is None: + model.set_tensor_datatype(cls_name, idt) + elif cls_dt != idt: + continue + + new_node = helper.make_node( + "AddCLSToken", + [patch_name, cls_name], + node.output, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name="AddCLSToken_" + node.name, + NumTokens=int(patch_shape[1]), + NumChannels=int(patch_shape[2]), + PadTokens=0, + SIMD=1, + inputDataType=idt.name, + outputDataType=idt.name, + ) + graph.node.insert(node_ind, new_node) + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferStreamingEltwise(Transformation): """ DEPRECATED: This transformation is deprecated and now redirects to diff --git a/tests/fpgadataflow/test_fpgadataflow_addclstoken.py b/tests/fpgadataflow/test_fpgadataflow_addclstoken.py new file mode 100644 index 0000000000..7e57c3ef0e --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_addclstoken.py @@ -0,0 +1,299 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
#
# * Neither the name of FINN nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import pytest

import ast
import numpy as np
from functools import partial
from onnx import TensorProto, helper, numpy_helper
from qonnx.core.datatype import DataType
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.general import GiveUniqueNodeNames

from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.analysis.fpgadataflow.res_estimation import (
    res_estimation,
    res_estimation_complete,
)
from finn.core.onnx_exec import execute_onnx
from finn.transformation.fpgadataflow.convert_to_hw_layers import InferAddCLSTokenLayer
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext

# FPGA part and clock period (ns) passed to every transformation that needs them.
FPGA_PART = "xc7z020clg400-1"
CLK_NS = 10


def _make_graph(nodes, output_shape, cls_values, finn_dtype=DataType["INT8"]):
    """Wrap ``nodes`` in a one-input model that carries ``cls_values`` as an initializer.

    The graph input ``patches`` is a float tensor of shape ``[1, 3, 4]``, the graph output
    ``out`` has ``output_shape`` and ``cls_values`` (cast to float32) is stored as the
    initializer ``cls``. All three tensors are annotated with ``finn_dtype``.

    Returns the resulting ``ModelWrapper``.
    """
    patch_shape = [1, 3, 4]
    patches = helper.make_tensor_value_info("patches", TensorProto.FLOAT, patch_shape)
    output = helper.make_tensor_value_info("out", TensorProto.FLOAT, output_shape)
    cls_init = numpy_helper.from_array(cls_values.astype(np.float32), name="cls")
    graph = helper.make_graph(nodes, "addclstoken_test", [patches], [output], [cls_init])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])
    model = ModelWrapper(model)
    for tensor_name in ["patches", "cls", "out"]:
        model.set_tensor_datatype(tensor_name, finn_dtype)
    return model


def _make_concat_model():
    """Build a model with a single ``Concat(cls, patches)`` node along axis 1.

    Returns ``(model, cls_values)`` so callers can compute the reference output.
    """
    cls_values = np.asarray([[[1, -2, 3, -4]]], dtype=np.float32)
    concat = helper.make_node(
        "Concat",
        ["cls", "patches"],
        ["out"],
        axis=1,
        name="concat_cls",
    )
    model = _make_graph([concat], [1, 4, 4], cls_values)
    return model, cls_values


def _make_addclstoken_model(
    pad_tokens=0,
    simd=1,
    finn_dtype=DataType["INT8"],
    cls_values=None,
):
    """Build a model with a single ``AddCLSToken`` node (3 tokens, 4 channels).

    ``pad_tokens`` and ``simd`` are written to the node's ``PadTokens`` and ``SIMD``
    attributes, ``finn_dtype`` is used for both the input and output datatype, and
    ``cls_values`` defaults to ``[[[1, -2, 3, -4]]]`` when not given.

    Returns ``(model, cls_values)``.
    """
    if cls_values is None:
        cls_values = np.asarray([[[1, -2, 3, -4]]], dtype=np.float32)
    addcls = helper.make_node(
        "AddCLSToken",
        ["patches", "cls"],
        ["out"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        name="AddCLSToken_0",
        NumTokens=3,
        NumChannels=4,
        PadTokens=pad_tokens,
        SIMD=simd,
        inputDataType=finn_dtype.name,
        outputDataType=finn_dtype.name,
    )
    # Output holds the class token, the three patch tokens and any padding tokens.
    model = _make_graph([addcls], [1, 4 + pad_tokens, 4], cls_values, finn_dtype)
    return model, cls_values


def _prepare_addclstoken_stitched_ip_model(simd=1, pad_tokens=0):
    """Build an AddCLSToken model and take it through to a stitched IP.

    Applies, in order: SpecializeLayers, InsertFIFO (shallow FIFOs), SpecializeLayers again
    for the inserted FIFOs, GiveUniqueNodeNames, PrepareIP, HLSSynthIP and CreateStitchedIP.

    Returns ``(model, cls_values)``.
    """
    model, cls_values = _make_addclstoken_model(pad_tokens=pad_tokens, simd=simd)
    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(InsertFIFO(create_shallow_fifos=True))
    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(FPGA_PART, CLK_NS))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(FPGA_PART, CLK_NS, vitis=False))
    return model, cls_values


def _make_input_dict(model, patches):
    """Map the model's first graph input name to ``patches`` for ``execute_onnx``."""
    return {model.graph.input[0].name: patches}


def _make_patches():
    """Return the ``(1, 3, 4)`` float32 patch tensor holding the values 0..11."""
    return np.arange(12, dtype=np.float32).reshape(1, 3, 4)


def _expected_output(cls_values, patches, pad_tokens=0):
    """Return the reference result: class token, patches, then ``pad_tokens`` zero tokens."""
    parts = [cls_values, patches]
    if pad_tokens > 0:
        parts.append(np.zeros((1, pad_tokens, 4), dtype=np.float32))
    return np.concatenate(parts, axis=1)


@pytest.mark.fpgadataflow
def test_convert_concat_to_addclstoken():
    """Concat(cls, patches) is converted to AddCLSToken and specialized to the RTL variant."""
    model, cls_values = _make_concat_model()
    patches = _make_patches()
    expected = _expected_output(cls_values, patches)

    # Reference run on the plain ONNX Concat.
    ret = execute_onnx(model, _make_input_dict(model, patches))
    assert (ret["out"] == expected).all()

    model = model.transform(InferAddCLSTokenLayer())
    node = model.graph.node[0]
    assert node.op_type == "AddCLSToken"
    assert node.domain == "finn.custom_op.fpgadataflow"
    # The converted node takes the patches first and the class token second.
    assert list(node.input) == ["patches", "cls"]

    inst = getCustomOp(node)
    assert inst.get_normal_output_shape() == (1, 4, 4)
    assert inst.get_exp_cycles() == 16

    # The converted node must produce the same result as the original Concat.
    ret = execute_onnx(model, _make_input_dict(model, patches))
    assert (ret["out"] == expected).all()

    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())
    assert model.graph.node[0].op_type == "AddCLSToken_rtl"
    assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.rtl"


@pytest.mark.fpgadataflow
def test_addclstoken_python_execution_with_padding():
    """Executing AddCLSToken with PadTokens=2 appends two all-zero tokens."""
    model, cls_values = _make_addclstoken_model(pad_tokens=2)
    patches = _make_patches()
    expected = _expected_output(cls_values, patches, pad_tokens=2)

    ret = execute_onnx(model, _make_input_dict(model, patches))
    assert (ret["out"] == expected).all()


@pytest.mark.fpgadataflow
@pytest.mark.parametrize(
    "finn_dtype,cls_values,expected_cls_data",
    [
        (DataType["INT8"], np.asarray([[[1, -2, 3, -4]]], dtype=np.float32), "32'hfc03fe01"),
        (DataType["UINT4"], np.asarray([[[1, 2, 3, 4]]], dtype=np.float32), "16'h4321"),
        (DataType["BIPOLAR"], np.asarray([[[1, -1, 1, -1]]], dtype=np.float32), "4'h5"),
    ],
)
def test_addclstoken_rtl_codegen(tmp_path, finn_dtype, cls_values, expected_cls_data):
    """IP code generation emits the wrapper and core files with the expected parameters.

    Each parametrized case pairs class-token values with the literal expected for
    ``CLS_DATA`` in the generated wrapper.
    """
    model, _ = _make_addclstoken_model(
        pad_tokens=1,
        simd=2,
        finn_dtype=finn_dtype,
        cls_values=cls_values,
    )
    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())

    node = model.graph.node[0]
    inst = getCustomOp(node)
    inst.set_nodeattr("code_gen_dir_ipgen", str(tmp_path))
    inst.code_generation_ipgen(model, FPGA_PART, CLK_NS)

    topname = inst.get_nodeattr("gen_top_module")
    assert topname == node.name
    wrapper = tmp_path / (topname + ".v")
    core = tmp_path / "addclstoken.sv"
    assert wrapper.is_file()
    assert core.is_file()
    wrapper_text = wrapper.read_text()
    assert "parameter FOLD_WIDTH = %d" % (2 * finn_dtype.bitwidth()) in wrapper_text
    assert ".SIMD(2)" in wrapper_text
    assert ".PAD_TOKENS(1)" in wrapper_text
    assert "CLS_DATA = %s" % expected_cls_data in wrapper_text
    assert "out0_V_TVALID" in wrapper_text
    # NOTE(review): presumably guards against a zero-filled default being left in the
    # wrapper; confirm against the wrapper template.
    assert "= '0" not in wrapper_text

    ipi_cmds = inst.code_generation_ipi()
    assert any("addclstoken.sv" in cmd for cmd in ipi_cmds)
    assert any("create_bd_cell" in cmd and topname in cmd for cmd in ipi_cmds)


@pytest.mark.fpgadataflow
def test_addclstoken_resource_estimation():
    """Both resource-estimation analyses report the expected figures for the single node."""
    model, _ = _make_addclstoken_model(pad_tokens=1, simd=2)
    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())

    expected = {
        "BRAM_18K": 0,
        "BRAM_efficiency": 1,
        "LUT": 132,
        "URAM": 0,
        "URAM_efficiency": 1,
        "DSP": 0,
    }
    resources = model.analysis(partial(res_estimation, fpgapart=FPGA_PART))
    assert len(resources) == 1
    assert list(resources.values())[0] == expected

    # The "complete" analysis reports a list of estimates per node.
    complete_resources = model.analysis(partial(res_estimation_complete, fpgapart=FPGA_PART))
    assert len(complete_resources) == 1
    assert list(complete_resources.values())[0] == [expected]


@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
@pytest.mark.parametrize("simd,pad_tokens", [(1, 0), (2, 1)])
def test_addclstoken_rtlsim(simd, pad_tokens):
    """Node-level rtlsim matches the reference output and the expected cycle count."""
    model, cls_values = _make_addclstoken_model(pad_tokens=pad_tokens, simd=simd)
    patches = _make_patches()
    expected = _expected_output(cls_values, patches, pad_tokens)

    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(FPGA_PART, CLK_NS))
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(PrepareRTLSim())

    ret = execute_onnx(model, _make_input_dict(model, patches))
    assert (ret["out"] == expected).all()

    node = model.get_nodes_by_op_type("AddCLSToken_rtl")[0]
    inst = getCustomOp(node)
    cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
    exp_cycles = exp_cycles_dict[node.name]
    # The estimate may differ from the simulated count by at most 10 cycles.
    assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
    assert exp_cycles != 0


@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
@pytest.mark.parametrize("simd,pad_tokens", [(1, 0), (2, 1)])
def test_addclstoken_stitched_ip_rtlsim(simd, pad_tokens):
    """rtlsim of the stitched IP (node plus FIFOs) matches the reference output."""
    model, cls_values = _prepare_addclstoken_stitched_ip_model(
        simd=simd,
        pad_tokens=pad_tokens,
    )
    patches = _make_patches()
    expected = _expected_output(cls_values, patches, pad_tokens)

    # Model-level exec_mode makes execute_onnx simulate the whole stitched design.
    model.set_metadata_prop("exec_mode", "rtlsim")

    ret = execute_onnx(model, _make_input_dict(model, patches))
    assert (ret["out"] == expected).all()


@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_addclstoken_stitched_ip_synth_ooc():
    """Out-of-context synthesis of the stitched IP uses logic only and meets timing."""
    model, _ = _prepare_addclstoken_stitched_ip_model(simd=2, pad_tokens=1)
    model = model.transform(SynthOutOfContext(FPGA_PART, CLK_NS))
    ret = model.get_metadata_prop("res_total_ooc_synth")
    assert ret is not None
    # Parse the stored string as a Python literal instead of eval()-ing it, so nothing in
    # the metadata can be executed as code.
    ret = ast.literal_eval(ret)

    assert ret["LUT"] > 0
    assert ret["FF"] > 0
    assert ret["DSP"] == 0
    assert ret["BRAM"] == 0
    # Non-negative worst negative slack means the design meets the CLK_NS constraint.
    assert ret["WNS"] >= 0