From 44e66e5a1fea66010c05371fbe15379c99f7c0cb Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Tue, 28 Apr 2026 17:31:31 +0100 Subject: [PATCH 1/3] AddCLSToken initial commit --- .../finn.custom_op.fpgadataflow.rst | 8 + .../finn.custom_op.fpgadataflow.rtl.rst | 8 + finn-rtllib/addclstoken/hdl/addclstoken.sv | 150 +++++++++ .../addclstoken/hdl/addclstoken_template.v | 81 +++++ src/finn/builder/build_dataflow_steps.py | 2 + src/finn/custom_op/fpgadataflow/__init__.py | 2 + .../custom_op/fpgadataflow/addclstoken.py | 171 ++++++++++ .../custom_op/fpgadataflow/rtl/__init__.py | 2 + .../fpgadataflow/rtl/addclstoken_rtl.py | 211 ++++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 79 +++++ .../fpgadataflow/specialize_layers.py | 1 + src/finn/util/vivado.py | 44 ++- .../test_fpgadataflow_addclstoken.py | 299 ++++++++++++++++++ 13 files changed, 1050 insertions(+), 8 deletions(-) create mode 100644 finn-rtllib/addclstoken/hdl/addclstoken.sv create mode 100644 finn-rtllib/addclstoken/hdl/addclstoken_template.v create mode 100644 src/finn/custom_op/fpgadataflow/addclstoken.py create mode 100644 src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py create mode 100644 tests/fpgadataflow/test_fpgadataflow_addclstoken.py diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index 25aafc324e..0688664bfe 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -39,6 +39,14 @@ RTLBackend :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.addclstoken +----------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.addclstoken + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.addstreams ---------------------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst index 346eddb073..859a789f2f 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst @@ -5,6 +5,14 @@ Custom Op - fpgadataflow.rtl RTL Custom Op Nodes =================== +finn.custom\_op.fpgadataflow.rtl.addclstoken\_rtl +-------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.addclstoken_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl ------------------------------------------------------------ diff --git a/finn-rtllib/addclstoken/hdl/addclstoken.sv b/finn-rtllib/addclstoken/hdl/addclstoken.sv new file mode 100644 index 0000000000..768b2a9a06 --- /dev/null +++ b/finn-rtllib/addclstoken/hdl/addclstoken.sv @@ -0,0 +1,150 @@ +/****************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
/**
 * Stream stage that prepends a class (CLS) token to every frame of patch
 * tokens and optionally appends all-zero padding tokens.
 *
 * Per frame the output stream carries, in this order:
 *   1. FOLDS_PER_TOKEN words sliced out of cls_data    (state EMIT_CLS)
 *   2. TOTAL_INPUT_FOLDS words forwarded from idat     (state PASSTHROUGH)
 *   3. TOTAL_PAD_FOLDS zero words, if PAD_TOKENS != 0  (state EMIT_PAD)
 *
 * Parameters:
 *   NUM_TOKENS   - patch tokens per frame on the input stream (must be >= 1)
 *   NUM_CHANNELS - elements per token
 *   SIMD         - elements per stream word (must divide NUM_CHANNELS)
 *   ELEM_WIDTH   - bits per element
 *   PAD_TOKENS   - zero tokens appended after the patch tokens
 *
 * cls_data holds the complete CLS token; stream word k of the token occupies
 * bits [k*FOLD_WIDTH +: FOLD_WIDTH].
 */
module addclstoken #(
    parameter int unsigned NUM_TOKENS = 196,
    parameter int unsigned NUM_CHANNELS = 192,
    parameter int unsigned SIMD = 1,
    parameter int unsigned ELEM_WIDTH = 8,
    parameter int unsigned PAD_TOKENS = 0
)(
    input  logic clk,
    input  logic rst,

    output logic irdy,
    input  logic ivld,
    input  logic [SIMD*ELEM_WIDTH-1:0] idat,

    input  logic ordy,
    output logic ovld,
    output logic [SIMD*ELEM_WIDTH-1:0] odat,

    input  logic [NUM_CHANNELS*ELEM_WIDTH-1:0] cls_data
);

    localparam int unsigned FOLD_WIDTH = SIMD * ELEM_WIDTH;
    localparam int unsigned FOLDS_PER_TOKEN = NUM_CHANNELS / SIMD;
    localparam int unsigned TOTAL_INPUT_FOLDS = NUM_TOKENS * FOLDS_PER_TOKEN;
    localparam int unsigned TOTAL_PAD_FOLDS = PAD_TOKENS * FOLDS_PER_TOKEN;
    // Longest of the three phases; it determines the width of the shared counter.
    localparam int unsigned MAX_PHASE_FOLDS =
        (TOTAL_INPUT_FOLDS > FOLDS_PER_TOKEN) ?
            ((TOTAL_INPUT_FOLDS > TOTAL_PAD_FOLDS) ? TOTAL_INPUT_FOLDS : TOTAL_PAD_FOLDS) :
            ((FOLDS_PER_TOKEN > TOTAL_PAD_FOLDS) ? FOLDS_PER_TOKEN : TOTAL_PAD_FOLDS);
    localparam int unsigned CNT_WIDTH = (MAX_PHASE_FOLDS <= 1) ? 1 : $clog2(MAX_PHASE_FOLDS);

    // Parameter sanity checks. Without them an invalid configuration would
    // elaborate silently: a non-dividing SIMD truncates FOLDS_PER_TOKEN and
    // drops channels, and NUM_TOKENS == 0 makes TOTAL_INPUT_FOLDS - 1 wrap
    // around so that PASSTHROUGH never reaches its last fold.
    initial begin
        if ((SIMD == 0) || (NUM_CHANNELS % SIMD != 0)) begin
            $error("%m: SIMD=%0d must be a non-zero divisor of NUM_CHANNELS=%0d.", SIMD, NUM_CHANNELS);
            $finish;
        end
        if (NUM_TOKENS == 0) begin
            $error("%m: NUM_TOKENS must be at least 1.");
            $finish;
        end
        if (ELEM_WIDTH == 0) begin
            $error("%m: ELEM_WIDTH must be at least 1.");
            $finish;
        end
    end

    typedef enum logic [1:0] {
        EMIT_CLS,
        PASSTHROUGH,
        EMIT_PAD
    } state_t;

    state_t state;
    state_t next_state;
    logic [CNT_WIDTH-1:0] fold_cnt;     // words already transferred in the current phase
    logic fold_cnt_last;                // fold_cnt addresses the last word of the phase
    logic out_transfer;                 // output handshake completes in this cycle

    logic [CNT_WIDTH-1:0] cls_fold_cnt;
    logic [FOLD_WIDTH-1:0] cls_fold;

    // fold_cnt exceeds the CLS range during the longer phases; clamp the
    // index to 0 there so that the part-select never leaves cls_data.
    assign cls_fold_cnt = (int'(fold_cnt) < FOLDS_PER_TOKEN) ? fold_cnt : '0;
    assign cls_fold = cls_data[cls_fold_cnt * FOLD_WIDTH +: FOLD_WIDTH];
    assign out_transfer = ovld & ordy;

    // End-of-phase detection against the length of the active phase.
    always_comb begin
        unique case (state)
            EMIT_CLS:    fold_cnt_last = (int'(fold_cnt) == FOLDS_PER_TOKEN - 1);
            PASSTHROUGH: fold_cnt_last = (int'(fold_cnt) == TOTAL_INPUT_FOLDS - 1);
            EMIT_PAD:    fold_cnt_last = (int'(fold_cnt) == TOTAL_PAD_FOLDS - 1);
            default:     fold_cnt_last = 1'b1;
        endcase
    end

    // Output multiplexing. The input is only accepted while passing through,
    // where both handshakes are wired straight across.
    always_comb begin
        irdy = 1'b0;
        ovld = 1'b0;
        odat = '0;

        unique case (state)
            EMIT_CLS: begin
                ovld = 1'b1;
                odat = cls_fold;
            end
            PASSTHROUGH: begin
                irdy = ordy;
                ovld = ivld;
                odat = idat;
            end
            EMIT_PAD: begin
                // odat keeps its all-zero default
                ovld = 1'b1;
            end
            default: begin
            end
        endcase
    end

    // Phase sequencing: CLS -> patches -> (padding) -> CLS of the next frame.
    always_comb begin
        next_state = state;
        if (out_transfer && fold_cnt_last) begin
            unique case (state)
                EMIT_CLS: begin
                    next_state = PASSTHROUGH;
                end
                PASSTHROUGH: begin
                    next_state = (PAD_TOKENS == 0) ? EMIT_CLS : EMIT_PAD;
                end
                EMIT_PAD: begin
                    next_state = EMIT_CLS;
                end
                default: begin
                    next_state = EMIT_CLS;
                end
            endcase
        end
    end

    // State and counter only advance on a completed output transfer.
    always_ff @(posedge clk) begin
        if (rst) begin
            state <= EMIT_CLS;
            fold_cnt <= '0;
        end else if (out_transfer) begin
            if (fold_cnt_last) begin
                state <= next_state;
                fold_cnt <= '0;
            end else begin
                fold_cnt <= fold_cnt + 1'b1;
            end
        end
    end

endmodule
// AXI-Stream wrapper template around the addclstoken core.
// Every $NAME$ token is a placeholder that is textually replaced before this
// file is used as Verilog.
module $TOP_MODULE_NAME$ #(
    parameter FOLD_WIDTH = $FOLD_WIDTH$,
    // FOLD_WIDTH rounded up to the next multiple of 8 bits
    parameter AXI_WIDTH = ((FOLD_WIDTH + 7) / 8) * 8
)(
    (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
    (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
    input ap_clk,
    (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
    input ap_rst_n,

    // input stream
    output in0_V_TREADY,
    input in0_V_TVALID,
    input [AXI_WIDTH-1:0] in0_V_TDATA,

    // output stream
    input out_V_TREADY,
    output out_V_TVALID,
    output [AXI_WIDTH-1:0] out_V_TDATA
);

    // Constant CLS token handed to the core as its cls_data input
    localparam [$CLS_WIDTH$-1:0] CLS_DATA = $CLS_DATA$;

    wire [FOLD_WIDTH-1:0] core_out;

    assign out_V_TDATA[FOLD_WIDTH-1:0] = core_out;

    // Drive the bits above FOLD_WIDTH to zero when the AXI width was rounded up
    generate
        if (AXI_WIDTH > FOLD_WIDTH) begin : gen_pad_tdata
            assign out_V_TDATA[AXI_WIDTH-1:FOLD_WIDTH] = {(AXI_WIDTH-FOLD_WIDTH){1'b0}};
        end
    endgenerate

    addclstoken #(
        .NUM_TOKENS($NUM_TOKENS$),
        .NUM_CHANNELS($NUM_CHANNELS$),
        .SIMD($SIMD$),
        .ELEM_WIDTH($ELEM_WIDTH$),
        .PAD_TOKENS($PAD_TOKENS$)
    ) impl (
        .clk(ap_clk),
        // the core takes an active-high reset, the wrapper an active-low one
        .rst(!ap_rst_n),
        .irdy(in0_V_TREADY),
        .ivld(in0_V_TVALID),
        // only the low FOLD_WIDTH bits of the input word reach the core
        .idat(in0_V_TDATA[FOLD_WIDTH-1:0]),
        .ordy(out_V_TREADY),
        .ovld(out_V_TVALID),
        .odat(core_out),
        .cls_data(CLS_DATA)
    );

endmodule
--- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -348,6 +348,8 @@ def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) # TopK to LabelSelect model = model.transform(to_hw.InferLabelSelectLayer()) + # sequence CLS token insertion + model = model.transform(to_hw.InferAddCLSTokenLayer()) # input quantization (if any) as standalone threshold model = model.transform(to_hw.InferThresholdingLayer()) # needed for convolutions -- TODO always exec? diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index aed2ab7fe1..c6e8dd1dcc 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -27,6 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from finn.custom_op.fpgadataflow.addclstoken import AddCLSToken from finn.custom_op.fpgadataflow.addstreams import AddStreams from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp from finn.custom_op.fpgadataflow.concat import StreamingConcat @@ -66,6 +67,7 @@ custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition custom_op["AddStreams"] = AddStreams +custom_op["AddCLSToken"] = AddCLSToken custom_op["ChannelwiseOp"] = ChannelwiseOp custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["DownSampler"] = DownSampler diff --git a/src/finn/custom_op/fpgadataflow/addclstoken.py b/src/finn/custom_op/fpgadataflow/addclstoken.py new file mode 100644 index 0000000000..35eae4bb29 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/addclstoken.py @@ -0,0 +1,171 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy as np
import warnings
from qonnx.core.datatype import DataType

from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp


class AddCLSToken(HWCustomOp):
    """Abstraction layer for prepending a class (CLS) token to a token sequence.

    Input 0 carries the patch tokens with shape (1, NumTokens, NumChannels),
    input 1 the CLS token with shape (1, 1, NumChannels). The output has shape
    (1, NumTokens + 1 + PadTokens, NumChannels): the CLS token, the patch
    tokens and finally PadTokens all-zero tokens."""

    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)

    def get_nodeattr_types(self):
        """Return the inherited attribute table extended by the attributes of this op."""
        attr_types = super().get_nodeattr_types()
        # tokens on input 0, i.e. without the CLS token and without padding
        attr_types["NumTokens"] = ("i", True, 0)
        # elements per token
        attr_types["NumChannels"] = ("i", True, 0)
        # zero tokens appended behind the patch tokens
        attr_types["PadTokens"] = ("i", False, 0)
        # elements per stream word
        attr_types["SIMD"] = ("i", False, 1)
        attr_types["inputDataType"] = ("s", True, "")
        attr_types["outputDataType"] = ("s", False, "")
        return attr_types

    def get_normal_input_shape(self, ind=0):
        """Return the unfolded shape of input `ind` (0: patches, 1: CLS token)."""
        channels = self.get_nodeattr("NumChannels")
        if ind == 1:
            return (1, 1, channels)
        if ind != 0:
            raise Exception("AddCLSToken only has two inputs")
        return (1, self.get_nodeattr("NumTokens"), channels)

    def get_folded_input_shape(self, ind=0):
        """Return the shape of input `ind` with the channel axis split into
        (NumChannels // SIMD, SIMD)."""
        shape = self.get_normal_input_shape(ind)
        simd = self.get_nodeattr("SIMD")
        channels = shape[-1]
        assert channels % simd == 0, "SIMD must divide NumChannels"
        return (*shape[:-1], channels // simd, simd)

    def get_normal_output_shape(self, ind=0):
        """Return the unfolded output shape including CLS and padding tokens."""
        tokens = self.get_nodeattr("NumTokens")
        channels = self.get_nodeattr("NumChannels")
        padding = self.get_nodeattr("PadTokens")
        seq_len = tokens + 1 + padding
        return (1, seq_len, channels)

    def get_folded_output_shape(self, ind=0):
        """Return the output shape with the channel axis split into
        (NumChannels // SIMD, SIMD)."""
        shape = self.get_normal_output_shape(ind)
        simd = self.get_nodeattr("SIMD")
        channels = shape[-1]
        assert channels % simd == 0, "SIMD must divide NumChannels"
        return (*shape[:-1], channels // simd, simd)

    def make_shape_compatible_op(self, model):
        """Check both input shapes against the node attributes and return a
        constant-shape stand-in op producing the normal output shape."""
        checks = (
            (0, "Unexpected input shape for patch tokens."),
            (1, "Unexpected input shape for CLS token."),
        )
        for index, message in checks:
            expected = self.get_normal_input_shape(index)
            actual = tuple(model.get_tensor_shape(self.onnx_node.input[index]))
            assert actual == expected, message

        return super().make_const_shape_op(self.get_normal_output_shape())

    def infer_node_datatype(self, model):
        """Propagate the datatype of input 0 to the node attributes, to the CLS
        token tensor (if it has none) and to the output tensor."""
        node = self.onnx_node
        has_attr_dt = self.get_nodeattr("inputDataType") != ""
        declared = self.get_input_datatype() if has_attr_dt else None

        # the datatype annotated in the model wins over the node attribute
        actual = model.get_tensor_datatype(node.input[0])
        if actual is None:
            actual = declared
        if actual is None:
            raise Exception("AddCLSToken input datatype is not set")

        if not (declared is None or declared == actual):
            warnings.warn(
                "inputDataType changing for %s: %s -> %s" % (node.name, str(declared), str(actual))
            )
        self.set_nodeattr("inputDataType", actual.name)

        token_dt = model.get_tensor_datatype(node.input[1])
        if token_dt is not None:
            assert token_dt == actual, "CLS token datatype must match input datatype."
        else:
            model.set_tensor_datatype(node.input[1], actual)

        self.set_nodeattr("outputDataType", actual.name)
        model.set_tensor_datatype(node.output[0], actual)

    def verify_node(self):
        pass

    def get_input_datatype(self, ind=0):
        """Return the DataType named by the inputDataType attribute."""
        return DataType[self.get_nodeattr("inputDataType")]

    def get_output_datatype(self, ind=0):
        """Return the DataType named by outputDataType, or the input datatype
        if that attribute is empty."""
        name = self.get_nodeattr("outputDataType")
        return self.get_input_datatype(ind) if name == "" else DataType[name]

    def get_instream_width(self, ind=0):
        """Return the input stream width in bits; 0 for any input other than 0."""
        if ind == 0:
            return self.get_input_datatype().bitwidth() * self.get_nodeattr("SIMD")
        return 0

    def get_outstream_width(self, ind=0):
        """Return the output stream width in bits."""
        return self.get_output_datatype().bitwidth() * self.get_nodeattr("SIMD")

    def get_number_output_values(self):
        """Return the number of output stream words per frame."""
        outer_dims = self.get_folded_output_shape()[:-1]
        return int(np.prod(outer_dims))

    def get_exp_cycles(self):
        """Return the expected cycle count: one cycle per output stream word."""
        outer_dims = self.get_folded_output_shape()[:-1]
        return int(np.prod(outer_dims))

    def execute_node(self, context, graph):
        """Compute the output with numpy: CLS token, patch tokens, zero padding."""
        node = self.onnx_node
        patch_tokens = context[node.input[0]]
        cls_token = context[node.input[1]]

        sequence = np.concatenate([cls_token, patch_tokens], axis=1)
        num_pad = self.get_nodeattr("PadTokens")
        if num_pad > 0:
            zeros = np.zeros((1, num_pad, self.get_nodeattr("NumChannels")), dtype=sequence.dtype)
            sequence = np.concatenate([sequence, zeros], axis=1)

        as_float = np.asarray(sequence, dtype=np.float32)
        context[node.output[0]] = as_float.reshape(self.get_normal_output_shape())

    def bram_estimation(self):
        return 0

    def lut_estimation(self):
        return int(128 + self.get_nodeattr("NumChannels"))

    def get_op_and_param_counts(self):
        """Report the CLS token elements as the only parameters of this op."""
        return {"param_cls_token": int(self.get_nodeattr("NumChannels"))}
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# --- src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py ---

import numpy as np
import os
import shutil
from qonnx.core.datatype import DataType

from finn.custom_op.fpgadataflow.addclstoken import AddCLSToken
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy

# PyVerilator is optional at import time; prepare_rtlsim raises if it is missing.
try:
    from pyverilator import PyVerilator
except ModuleNotFoundError:
    PyVerilator = None


def _rtlsrc_dir():
    """Return the HDL source directory of the addclstoken core below $FINN_ROOT.

    Raises KeyError if the FINN_ROOT environment variable is not set."""
    return os.environ["FINN_ROOT"] + "/finn-rtllib/addclstoken/hdl"


class AddCLSToken_rtl(AddCLSToken, RTLBackend):
    """RTL implementation of AddCLSToken."""

    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)

    def get_nodeattr_types(self):
        """Return the union of the AddCLSToken and RTLBackend node attributes."""
        my_attrs = {}
        my_attrs.update(AddCLSToken.get_nodeattr_types(self))
        my_attrs.update(RTLBackend.get_nodeattr_types(self))
        return my_attrs

    def _pack_value(self, value, dtype):
        """Return `value` as an unsigned integer bit pattern of dtype.bitwidth() bits."""
        bitwidth = dtype.bitwidth()
        if dtype == DataType["BIPOLAR"]:
            # -1 -> 0, +1 -> 1
            int_value = int((value + 1) // 2)
        else:
            if dtype.is_fixed_point():
                # convert to the underlying integer representation
                value = value / dtype.scale_factor()
            int_value = int(value)
            if int_value < 0:
                # two's complement encoding of negative values
                int_value += 1 << bitwidth
        return int_value & ((1 << bitwidth) - 1)

    def _pack_cls_token(self, model):
        """Return the CLS token initializer as a sized Verilog hex literal.

        Element i of the flattened token is placed at bit offset i * bitwidth,
        so element 0 ends up in the least significant bits.

        Raises an Exception if input 1 has no initializer; asserts that the
        initializer shape matches the node attributes and that every value is
        allowed by the input datatype."""
        dtype = self.get_input_datatype()
        bitwidth = dtype.bitwidth()
        num_channels = self.get_nodeattr("NumChannels")
        cls_token = model.get_initializer(self.onnx_node.input[1])
        if cls_token is None:
            raise Exception("AddCLSToken RTL generation requires a constant CLS token input.")

        cls_token = np.asarray(cls_token, dtype=np.float32)
        assert cls_token.shape == self.get_normal_input_shape(
            1
        ), "CLS token shape does not match AddCLSToken attributes."
        assert np.vectorize(dtype.allowed)(cls_token).all(), (
            "CLS token values cannot be represented with %s" % dtype.name
        )
        packed = 0
        for i, value in enumerate(cls_token.flatten()):
            packed |= self._pack_value(value, dtype) << (i * bitwidth)
        return "%d'h%x" % (num_channels * bitwidth, packed)

    def generate_hdl(self, model, fpgapart, clk):
        """Instantiate the Verilog wrapper template and copy the core next to it.

        Writes <top module name>.v and addclstoken.sv into code_gen_dir_ipgen
        and sets the gen_top_module, ipgen_path and ip_path attributes.
        `fpgapart` and `clk` are not used by this implementation."""
        simd = self.get_nodeattr("SIMD")
        num_channels = self.get_nodeattr("NumChannels")
        assert num_channels % simd == 0, "SIMD must divide NumChannels"

        rtlsrc = _rtlsrc_dir()
        template_path = rtlsrc + "/addclstoken_template.v"
        with open(template_path, "r") as f:
            template = f.read()

        topname = self.get_verilog_top_module_name()
        self.set_nodeattr("gen_top_module", topname)

        elem_width = self.get_input_datatype().bitwidth()
        fold_width = elem_width * simd
        # placeholder name -> value; each $KEY$ in the template is replaced below
        code_gen_dict = {
            "TOP_MODULE_NAME": topname,
            "NUM_TOKENS": self.get_nodeattr("NumTokens"),
            "NUM_CHANNELS": num_channels,
            "SIMD": simd,
            "ELEM_WIDTH": elem_width,
            "PAD_TOKENS": self.get_nodeattr("PadTokens"),
            "FOLD_WIDTH": fold_width,
            "CLS_WIDTH": num_channels * elem_width,
            "CLS_DATA": self._pack_cls_token(model),
        }

        for key, value in code_gen_dict.items():
            template = template.replace("$%s$" % key, str(value))

        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        with open(os.path.join(code_gen_dir, topname + ".v"), "w") as f:
            f.write(template)
        shutil.copy(rtlsrc + "/addclstoken.sv", code_gen_dir)

        self.set_nodeattr("ipgen_path", code_gen_dir)
        self.set_nodeattr("ip_path", code_gen_dir)

    def prepare_rtlsim(self):
        """Build a PyVerilator simulation of the generated HDL and return it.

        The path of the built library is stored in the rtlsim_so attribute.
        Raises ImportError if PyVerilator is not installed."""
        if PyVerilator is None:
            raise ImportError("Installation of PyVerilator is required.")

        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        verilog_files = [
            "addclstoken.sv",
            self.get_nodeattr("gen_top_module") + ".v",
        ]
        sim = PyVerilator.build(
            verilog_files,
            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
            verilog_path=[code_gen_dir],
            trace_depth=get_rtlsim_trace_depth(),
            top_module_name=self.get_nodeattr("gen_top_module"),
        )
        self.set_nodeattr("rtlsim_so", sim.lib._name)
        return sim

    def code_generation_ipi(self):
        """Return the Tcl commands that add the generated sources and create a
        block-design cell named after this node."""
        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        sourcefiles = [
            "addclstoken.sv",
            self.get_nodeattr("gen_top_module") + ".v",
        ]
        sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles]

        cmd = []
        for f in sourcefiles:
            cmd += ["add_files -norecurse %s" % f]
        cmd += [
            "create_bd_cell -type module -reference %s %s"
            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
        ]
        return cmd

    def execute_node(self, context, graph):
        """Execute the node according to the exec_mode attribute.

        "cppsim" delegates to the numpy implementation of AddCLSToken.
        "rtlsim" streams input 0 through the RTL simulation; the CLS token is
        not read from `context` in this mode. Any other mode raises an
        Exception."""
        mode = self.get_nodeattr("exec_mode")
        if mode == "cppsim":
            AddCLSToken.execute_node(self, context, graph)
        elif mode == "rtlsim":
            node = self.onnx_node
            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
            exp_ishape = self.get_normal_input_shape(0)
            exp_oshape = self.get_normal_output_shape()

            inp = context[node.input[0]]
            assert str(inp.dtype) == "float32", "Input datatype is not float32"
            assert inp.shape == exp_ishape, "Input shape does not match expected shape."

            # the input is handed to the simulation via a folded .npy file
            folded_ishape = self.get_folded_input_shape(0)
            np.save(os.path.join(code_gen_dir, "input_0.npy"), inp.reshape(folded_ishape).copy())

            sim = self.get_rtlsim()
            export_idt = self.get_input_datatype()
            rtlsim_inp = npy_to_rtlsim_input(
                os.path.join(code_gen_dir, "input_0.npy"),
                export_idt,
                self.get_instream_width(),
            )
            self.reset_rtlsim(sim)
            self.toggle_clk(sim)
            rtlsim_output = self.rtlsim(sim, rtlsim_inp)

            odt = self.get_output_datatype()
            out_npy = rtlsim_output_to_npy(
                rtlsim_output,
                os.path.join(code_gen_dir, "output.npy"),
                odt,
                self.get_folded_output_shape(),
                self.get_outstream_width(),
                odt.bitwidth(),
            )
            context[node.output[0]] = np.asarray(out_npy, dtype=np.float32).reshape(exp_oshape)
        else:
            raise Exception(
                """Invalid value for attribute exec_mode! Is currently set to: {}
            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
                    mode
                )
            )


# --- src/finn/transformation/fpgadataflow/convert_to_hw_layers.py ---
# Transformation, helper, get_by_name, InferShapes and InferDataTypes are not
# defined here; presumably they are imported at the top of that module.


class InferAddCLSTokenLayer(Transformation):
    """Convert Concat([cls_token, patches], axis=1) into AddCLSToken.

    A Concat node is only converted if all of the following hold:

    * it has an axis attribute and exactly two inputs,
    * the first input has an initializer and the second one does not,
    * both shapes are fully known, the second input has rank 3 and the
      (normalized) concat axis is 1,
    * the first input has shape (1, 1, C) and the second (1, N, C),
    * a known output shape equals (1, N + 1, C),
    * the second input has an integer datatype, and the first input either
      has no datatype yet (it is then set to that datatype) or the same one.

    The new node is created with SIMD=1 and PadTokens=0."""

    def apply(self, model):
        graph = model.graph
        node_ind = 0
        graph_modified = False
        for node in graph.node:
            # after the increment, node_ind is the position right behind `node`
            node_ind += 1
            if node.op_type != "Concat":
                continue

            axis = get_by_name(node.attribute, "axis")
            if axis is None or len(node.input) != 2:
                continue

            cls_name = node.input[0]
            patch_name = node.input[1]
            cls_init = model.get_initializer(cls_name)
            if cls_init is None or model.get_initializer(patch_name) is not None:
                continue

            cls_shape = model.get_tensor_shape(cls_name)
            if cls_shape is None:
                # fall back to the shape of the initializer itself
                cls_shape = list(cls_init.shape)
            patch_shape = model.get_tensor_shape(patch_name)
            if cls_shape is None or patch_shape is None:
                continue
            if any(x is None for x in list(cls_shape) + list(patch_shape)):
                continue

            rank = len(patch_shape)
            # map a negative axis onto its non-negative equivalent
            concat_axis = axis.i if axis.i >= 0 else axis.i + rank
            if rank != 3 or concat_axis != 1:
                continue

            if len(cls_shape) != 3 or cls_shape[0] != 1 or cls_shape[1] != 1:
                continue
            if patch_shape[0] != 1 or cls_shape[2] != patch_shape[2]:
                continue

            out_shape = model.get_tensor_shape(node.output[0])
            exp_oshape = [1, patch_shape[1] + 1, patch_shape[2]]
            if out_shape is not None and list(out_shape) != exp_oshape:
                continue

            idt = model.get_tensor_datatype(patch_name)
            if idt is None or not idt.is_integer():
                continue
            cls_dt = model.get_tensor_datatype(cls_name)
            if cls_dt is None:
                model.set_tensor_datatype(cls_name, idt)
            elif cls_dt != idt:
                continue

            # note the swapped input order: AddCLSToken takes the patches
            # first and the CLS token second
            new_node = helper.make_node(
                "AddCLSToken",
                [patch_name, cls_name],
                node.output,
                domain="finn.custom_op.fpgadataflow",
                backend="fpgadataflow",
                name="AddCLSToken_" + node.name,
                NumTokens=int(patch_shape[1]),
                NumChannels=int(patch_shape[2]),
                PadTokens=0,
                SIMD=1,
                inputDataType=idt.name,
                outputDataType=idt.name,
            )
            graph.node.insert(node_ind, new_node)
            graph.node.remove(node)
            graph_modified = True

        if graph_modified:
            model = model.transform(InferShapes())
            model = model.transform(InferDataTypes())
        return (model, graph_modified)
def _extract_util_from_report(vivado_proj_folder, row_name):
    """Extract the Used column for a row in Vivado's utilization report.

    Scans ``vivado.log`` inside ``vivado_proj_folder`` for the first table row
    whose first cell equals ``row_name`` and returns the number found in the
    second cell as a float.

    Returns None when the log cannot be opened or when no row carries a
    well-formed decimal number in its second cell.
    """
    log_path = os.path.join(vivado_proj_folder, "vivado.log")
    # Require a complete decimal literal so that tokens such as "." or "1.2.3"
    # are skipped instead of making float() raise ValueError.
    row_pattern = re.compile(r"^\|\s*%s\s*\|\s*([0-9]+(?:\.[0-9]+)?)\s*\|" % re.escape(row_name))
    try:
        # Replace undecodable bytes rather than aborting the scan on them.
        with open(log_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                match = row_pattern.match(line)
                if match is not None:
                    return float(match.group(1))
    except OSError:
        # Missing file, directory in place of the log, or unreadable log.
        return None
    return None
new file mode 100644 index 0000000000..07caadc99f --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_addclstoken.py @@ -0,0 +1,299 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def _make_graph(nodes, output_shape, cls_values, finn_dtype=DataType["INT8"]):
    """Wrap ``nodes`` into a ModelWrapper with a fixed patch input and a CLS initializer.

    The graph has one input ``patches`` of shape [1, 3, 4], one output ``out``
    of shape ``output_shape`` and an initializer ``cls`` holding ``cls_values``
    as float32. All three tensors are annotated with ``finn_dtype``.
    """
    value_info = helper.make_tensor_value_info
    graph_inputs = [value_info("patches", TensorProto.FLOAT, [1, 3, 4])]
    graph_outputs = [value_info("out", TensorProto.FLOAT, output_shape)]
    initializers = [numpy_helper.from_array(cls_values.astype(np.float32), name="cls")]

    graph = helper.make_graph(nodes, "addclstoken_test", graph_inputs, graph_outputs, initializers)
    onnx_model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])
    wrapped = ModelWrapper(onnx_model)
    for tensor_name in ("patches", "cls", "out"):
        wrapped.set_tensor_datatype(tensor_name, finn_dtype)
    return wrapped
def _make_concat_model():
    """Build a single-node model computing ``Concat([cls, patches], axis=1)``."""
    cls_values = np.asarray([[[1, -2, 3, -4]]], dtype=np.float32)
    concat = helper.make_node(
        "Concat",
        ["cls", "patches"],
        ["out"],
        axis=1,
        name="concat_cls",
    )
    model = _make_graph([concat], [1, 4, 4], cls_values)
    return model, cls_values


def _make_addclstoken_model(
    pad_tokens=0,
    simd=1,
    finn_dtype=DataType["INT8"],
    cls_values=None,
):
    """Build a single-node AddCLSToken model; returns ``(model, cls_values)``."""
    if cls_values is None:
        cls_values = np.asarray([[[1, -2, 3, -4]]], dtype=np.float32)
    addcls = helper.make_node(
        "AddCLSToken",
        ["patches", "cls"],
        ["out"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        name="AddCLSToken_0",
        NumTokens=3,
        NumChannels=4,
        PadTokens=pad_tokens,
        SIMD=simd,
        inputDataType=finn_dtype.name,
        outputDataType=finn_dtype.name,
    )
    model = _make_graph([addcls], [1, 4 + pad_tokens, 4], cls_values, finn_dtype)
    return model, cls_values


def _prepare_addclstoken_stitched_ip_model(simd=1, pad_tokens=0):
    """Run the AddCLSToken model through specialization, FIFO insertion and IP stitching."""
    model, cls_values = _make_addclstoken_model(pad_tokens=pad_tokens, simd=simd)
    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(InsertFIFO(create_shallow_fifos=True))
    # Second pass specializes the FIFO nodes that were just inserted.
    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(FPGA_PART, CLK_NS))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(FPGA_PART, CLK_NS, vitis=False))
    return model, cls_values


def _make_patches():
    """Return the deterministic [1, 3, 4] patch tensor used by all tests."""
    return np.arange(12, dtype=np.float32).reshape(1, 3, 4)


def _expected_output(cls_values, patches, pad_tokens=0):
    """Return the reference result: CLS token, patches, then ``pad_tokens`` zero tokens."""
    parts = [cls_values, patches]
    if pad_tokens > 0:
        parts.append(np.zeros((1, pad_tokens, 4), dtype=np.float32))
    return np.concatenate(parts, axis=1)


@pytest.mark.fpgadataflow
def test_convert_concat_to_addclstoken():
    model, cls_values = _make_concat_model()
    patches = _make_patches()
    expected = _expected_output(cls_values, patches)

    ret = execute_onnx(model, {"patches": patches})
    assert (ret["out"] == expected).all()

    model = model.transform(InferAddCLSTokenLayer())
    node = model.graph.node[0]
    assert node.op_type == "AddCLSToken"
    assert node.domain == "finn.custom_op.fpgadataflow"
    assert list(node.input) == ["patches", "cls"]

    inst = getCustomOp(node)
    assert inst.get_normal_output_shape() == (1, 4, 4)
    assert inst.get_exp_cycles() == 16

    ret = execute_onnx(model, {"patches": patches})
    assert (ret["out"] == expected).all()

    model = model.transform(SpecializeLayers(FPGA_PART))
    assert model.graph.node[0].op_type == "AddCLSToken_rtl"
    assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.rtl"
    assert model.graph.node[0].name == "AddCLSToken_concat_cls"


@pytest.mark.fpgadataflow
def test_addclstoken_python_execution_with_padding():
    model, cls_values = _make_addclstoken_model(pad_tokens=2)
    patches = _make_patches()
    expected = _expected_output(cls_values, patches, pad_tokens=2)

    ret = execute_onnx(model, {"patches": patches})
    assert (ret["out"] == expected).all()


@pytest.mark.fpgadataflow
@pytest.mark.parametrize(
    "finn_dtype,cls_values,expected_cls_data",
    [
        (DataType["INT8"], np.asarray([[[1, -2, 3, -4]]], dtype=np.float32), "32'hfc03fe01"),
        (DataType["UINT4"], np.asarray([[[1, 2, 3, 4]]], dtype=np.float32), "16'h4321"),
        (DataType["BIPOLAR"], np.asarray([[[1, -1, 1, -1]]], dtype=np.float32), "4'h5"),
    ],
)
def test_addclstoken_rtl_codegen(tmp_path, monkeypatch, finn_dtype, cls_values, expected_cls_data):
    # Code generation needs FINN_ROOT; default it to the repository root when unset.
    if "FINN_ROOT" not in os.environ:
        monkeypatch.setenv("FINN_ROOT", str(Path(__file__).resolve().parents[2]))

    model, _ = _make_addclstoken_model(
        pad_tokens=1,
        simd=2,
        finn_dtype=finn_dtype,
        cls_values=cls_values,
    )
    model = model.transform(SpecializeLayers(FPGA_PART))

    node = model.graph.node[0]
    inst = getCustomOp(node)
    inst.set_nodeattr("code_gen_dir_ipgen", str(tmp_path))
    inst.code_generation_ipgen(model, FPGA_PART, CLK_NS)

    topname = inst.get_nodeattr("gen_top_module")
    assert topname == "AddCLSToken_0"
    wrapper = tmp_path / (topname + ".v")
    core = tmp_path / "addclstoken.sv"
    assert wrapper.is_file()
    assert core.is_file()
    wrapper_text = wrapper.read_text()
    assert "parameter FOLD_WIDTH = %d" % (2 * finn_dtype.bitwidth()) in wrapper_text
    assert ".SIMD(2)" in wrapper_text
    assert ".PAD_TOKENS(1)" in wrapper_text
    assert "CLS_DATA = %s" % expected_cls_data in wrapper_text
    assert "= '0" not in wrapper_text

    ipi_cmds = inst.code_generation_ipi()
    assert any("addclstoken.sv" in cmd for cmd in ipi_cmds)
    assert any("create_bd_cell" in cmd and topname in cmd for cmd in ipi_cmds)


@pytest.mark.fpgadataflow
def test_addclstoken_resource_estimation():
    model, _ = _make_addclstoken_model(pad_tokens=1, simd=2)
    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())

    expected = {
        "BRAM_18K": 0,
        "BRAM_efficiency": 1,
        "LUT": 132,
        "URAM": 0,
        "URAM_efficiency": 1,
        "DSP": 0,
    }
    resources = model.analysis(partial(res_estimation, fpgapart=FPGA_PART))
    assert len(resources) == 1
    assert list(resources.values())[0] == expected

    complete_resources = model.analysis(partial(res_estimation_complete, fpgapart=FPGA_PART))
    assert len(complete_resources) == 1
    assert list(complete_resources.values())[0] == [expected]


@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
@pytest.mark.parametrize("simd,pad_tokens", [(1, 0), (2, 1)])
def test_addclstoken_rtlsim(simd, pad_tokens):
    model, cls_values = _make_addclstoken_model(pad_tokens=pad_tokens, simd=simd)
    patches = _make_patches()
    expected = _expected_output(cls_values, patches, pad_tokens)

    model = model.transform(SpecializeLayers(FPGA_PART))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(FPGA_PART, CLK_NS))
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(PrepareRTLSim())

    ret = execute_onnx(model, {"patches": patches})
    assert (ret["out"] == expected).all()

    node = model.get_nodes_by_op_type("AddCLSToken_rtl")[0]
    inst = getCustomOp(node)
    cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
    exp_cycles = exp_cycles_dict[node.name]
    assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
    assert exp_cycles != 0


@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
@pytest.mark.parametrize("simd,pad_tokens", [(1, 0), (2, 1)])
def test_addclstoken_stitched_ip_rtlsim(simd, pad_tokens):
    model, cls_values = _prepare_addclstoken_stitched_ip_model(
        simd=simd,
        pad_tokens=pad_tokens,
    )
    patches = _make_patches()
    expected = _expected_output(cls_values, patches, pad_tokens)

    model.set_metadata_prop("exec_mode", "rtlsim")
    model.set_metadata_prop("extra_verilator_args", str(["-Wno-TIMESCALEMOD"]))

    ret = execute_onnx(model, {"patches": patches})
    assert (ret["out"] == expected).all()


@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_addclstoken_stitched_ip_synth_ooc():
    # Local import keeps this change self-contained within the test module body.
    import ast

    model, _ = _prepare_addclstoken_stitched_ip_model(simd=2, pad_tokens=1)
    model = model.transform(SynthOutOfContext(FPGA_PART, CLK_NS))
    ret = model.get_metadata_prop("res_total_ooc_synth")
    assert ret is not None
    # Parse the stored dictionary text as a literal instead of executing it with eval().
    ret = ast.literal_eval(ret)

    assert ret["LUT"] > 0
    assert ret["FF"] > 0
    assert ret["DSP"] == 0
    assert ret["BRAM"] == 0
    assert ret["WNS"] >= 0
a/finn-rtllib/addclstoken/hdl/addclstoken.sv b/finn-rtllib/addclstoken/hdl/addclstoken.sv index 768b2a9a06..d5bbdc2188 100644 --- a/finn-rtllib/addclstoken/hdl/addclstoken.sv +++ b/finn-rtllib/addclstoken/hdl/addclstoken.sv @@ -1,33 +1,22 @@ -/****************************************************************************** +/**************************************************************************** * Copyright (C) 2026, Advanced Micro Devices, Inc. * All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: + * SPDX-License-Identifier: BSD-3-Clause * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. + * @brief Insert a constant class token into a folded token stream. + * @author Oliver Cassidy * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * @description + * Prepends a learned class token, supplied through cls_data, to each + * input sequence of patch tokens. The class token and patch tokens are + * transferred as SIMD-wide folds of ELEM_WIDTH-bit elements. * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ + * Per sequence, the output stream is: + * 1. NUM_CHANNELS/SIMD folds from cls_data + * 2. NUM_TOKENS pass-through input tokens + * 3. PAD_TOKENS zero-valued tokens, when padding is enabled + ***************************************************************************/ module addclstoken #( parameter int unsigned NUM_TOKENS = 196, From 658149983145fbbe53ccc8de6bf6c7846806d4eb Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Thu, 30 Apr 2026 16:58:07 +0100 Subject: [PATCH 3/3] Address AddCLSToken review comments --- docs/finn/components/rtl-swg.rst | 2 +- docs/finn/developers.rst | 2 +- docs/finn/source_code/finn.builder.rst | 2 +- docs/finn/source_code/finn.core.rst | 2 +- docs/finn/source_code/finn.rst | 2 +- .../fpgadataflow/rtl/addclstoken_rtl.py | 4 ++++ src/finn/custom_op/fpgadataflow/rtlbackend.py | 10 +++++++--- .../fpgadataflow/specialize_layers.py | 1 - .../test_fpgadataflow_addclstoken.py | 19 ++++++++++++------- 9 files changed, 28 insertions(+), 16 deletions(-) diff --git a/docs/finn/components/rtl-swg.rst b/docs/finn/components/rtl-swg.rst index e8db1d2fa7..8d48dc9d5a 100644 --- a/docs/finn/components/rtl-swg.rst +++ b/docs/finn/components/rtl-swg.rst @@ -96,7 +96,7 @@ Dynamic Mode The "default" style also supports a dynamic mode, which provides an interface to change feature map dimensions, stride, or dilation at run-time. 
See `this pull request `_ for more information. Folding -------- +======= The RTL SWG is supported by the basic automatic folding algorithm in FINN (:py:mod:`finn.transformation.fpgadataflow.set_folding.SetFolding`). Consider the following implications: diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst index 985b86b279..a265c699c9 100644 --- a/docs/finn/developers.rst +++ b/docs/finn/developers.rst @@ -99,7 +99,7 @@ computer, and you should be able to launch the various .tcl scripts or .xpr proj Docker container as well. Linting -------- +======= We use a pre-commit hook to auto-format Python code and check for issues. See https://pre-commit.com/ for installation. Once you have pre-commit, you can install diff --git a/docs/finn/source_code/finn.builder.rst b/docs/finn/source_code/finn.builder.rst index e4dc810e81..caadf3f91f 100644 --- a/docs/finn/source_code/finn.builder.rst +++ b/docs/finn/source_code/finn.builder.rst @@ -3,7 +3,7 @@ Builder ******* Modules -~~~~~~~ +======= finn.builder.build\_dataflow ---------------------------- diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst index 4f16b3ac74..28cb47eaf7 100644 --- a/docs/finn/source_code/finn.core.rst +++ b/docs/finn/source_code/finn.core.rst @@ -3,7 +3,7 @@ Core **** Modules -~~~~~~~ +======= qonnx.core.data\_layout ------------------------- diff --git a/docs/finn/source_code/finn.rst b/docs/finn/source_code/finn.rst index f67dd0fe9c..5547a46623 100644 --- a/docs/finn/source_code/finn.rst +++ b/docs/finn/source_code/finn.rst @@ -6,7 +6,7 @@ The FINN sources are divided into different modules. They are listed below. .. note:: **Some of these functions and modules are located in the `qonnx` repository.** Modules -~~~~~~~ +======= .. 
def get_rtlsim_input_indices(self):
    """Return ONNX input indices that are driven as RTLSim input streams.

    By default every input of ``self.onnx_node`` is treated as a stream.
    The result is a list so that it has the same type as the list returned
    by the AddCLSToken RTL override of this method.
    """
    return list(range(len(self.onnx_node.input)))
def _make_input_dict(model, patches):
    """Build an execution input dictionary keyed by the model's first graph input name."""
    first_input = model.graph.input[0]
    input_dict = {}
    input_dict[first_input.name] = patches
    return input_dict
model.graph.node[0].domain == "finn.custom_op.fpgadataflow.rtl" - assert model.graph.node[0].name == "AddCLSToken_concat_cls" @pytest.mark.fpgadataflow @@ -157,7 +161,7 @@ def test_addclstoken_python_execution_with_padding(): axis=1, ) - ret = execute_onnx(model, {"patches": patches}) + ret = execute_onnx(model, _make_input_dict(model, patches)) assert (ret["out"] == expected).all() @@ -178,6 +182,7 @@ def test_addclstoken_rtl_codegen(tmp_path, finn_dtype, cls_values, expected_cls_ cls_values=cls_values, ) model = model.transform(SpecializeLayers(FPGA_PART)) + model = model.transform(GiveUniqueNodeNames()) node = model.graph.node[0] inst = getCustomOp(node) @@ -185,7 +190,7 @@ def test_addclstoken_rtl_codegen(tmp_path, finn_dtype, cls_values, expected_cls_ inst.code_generation_ipgen(model, FPGA_PART, CLK_NS) topname = inst.get_nodeattr("gen_top_module") - assert topname == "AddCLSToken_0" + assert topname == node.name wrapper = tmp_path / (topname + ".v") core = tmp_path / "addclstoken.sv" assert wrapper.is_file() @@ -244,7 +249,7 @@ def test_addclstoken_rtlsim(simd, pad_tokens): model = model.transform(SetExecMode("rtlsim")) model = model.transform(PrepareRTLSim()) - ret = execute_onnx(model, {"patches": patches}) + ret = execute_onnx(model, _make_input_dict(model, patches)) assert (ret["out"] == expected).all() node = model.get_nodes_by_op_type("AddCLSToken_rtl")[0] @@ -273,7 +278,7 @@ def test_addclstoken_stitched_ip_rtlsim(simd, pad_tokens): model.set_metadata_prop("exec_mode", "rtlsim") - ret = execute_onnx(model, {"patches": patches}) + ret = execute_onnx(model, _make_input_dict(model, patches)) assert (ret["out"] == expected).all()