From 44e66e5a1fea66010c05371fbe15379c99f7c0cb Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Tue, 28 Apr 2026 17:31:31 +0100
Subject: [PATCH 1/3] AddCLSToken initial commit

---
 .../finn.custom_op.fpgadataflow.rst           |   8 +
 .../finn.custom_op.fpgadataflow.rtl.rst       |   8 +
 finn-rtllib/addclstoken/hdl/addclstoken.sv    | 150 +++++++++
 .../addclstoken/hdl/addclstoken_template.v    |  81 +++++
 src/finn/builder/build_dataflow_steps.py      |   2 +
 src/finn/custom_op/fpgadataflow/__init__.py   |   2 +
 .../custom_op/fpgadataflow/addclstoken.py     | 171 ++++++++++
 .../custom_op/fpgadataflow/rtl/__init__.py    |   2 +
 .../fpgadataflow/rtl/addclstoken_rtl.py       | 211 ++++++++++++
 .../fpgadataflow/convert_to_hw_layers.py      |  79 +++++
 .../fpgadataflow/specialize_layers.py         |   1 +
 src/finn/util/vivado.py                       |  44 ++-
 .../test_fpgadataflow_addclstoken.py          | 299 ++++++++++++++++++
 13 files changed, 1050 insertions(+), 8 deletions(-)
 create mode 100644 finn-rtllib/addclstoken/hdl/addclstoken.sv
 create mode 100644 finn-rtllib/addclstoken/hdl/addclstoken_template.v
 create mode 100644 src/finn/custom_op/fpgadataflow/addclstoken.py
 create mode 100644 src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_addclstoken.py

diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index 25aafc324e..0688664bfe 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -39,6 +39,14 @@ RTLBackend
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.addclstoken
+-----------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.addclstoken
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.fpgadataflow.addstreams
 ----------------------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
index 346eddb073..859a789f2f 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
@@ -5,6 +5,14 @@ Custom Op - fpgadataflow.rtl
 RTL Custom Op Nodes
 ===================
 
+finn.custom\_op.fpgadataflow.rtl.addclstoken\_rtl
+--------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.addclstoken_rtl
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl
 ------------------------------------------------------------
 
diff --git a/finn-rtllib/addclstoken/hdl/addclstoken.sv b/finn-rtllib/addclstoken/hdl/addclstoken.sv
new file mode 100644
index 0000000000..768b2a9a06
--- /dev/null
+++ b/finn-rtllib/addclstoken/hdl/addclstoken.sv
@@ -0,0 +1,150 @@
+/******************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module addclstoken #(  // emits the CLS token, then the input tokens, then optional zero padding
+    parameter int unsigned NUM_TOKENS = 196,  // input tokens forwarded per sequence
+    parameter int unsigned NUM_CHANNELS = 192,  // elements per token
+    parameter int unsigned SIMD = 1,  // elements per stream beat
+    parameter int unsigned ELEM_WIDTH = 8,  // bits per element
+    parameter int unsigned PAD_TOKENS = 0  // all-zero tokens appended after the input tokens
+)(
+    input  logic clk,
+    input  logic rst,  // synchronous, active high
+
+    output logic irdy,  // only asserted while in PASSTHROUGH
+    input  logic ivld,
+    input  logic [SIMD*ELEM_WIDTH-1:0] idat,
+
+    input  logic ordy,
+    output logic ovld,
+    output logic [SIMD*ELEM_WIDTH-1:0] odat,
+
+    input  logic [NUM_CHANNELS*ELEM_WIDTH-1:0] cls_data  // whole CLS token; fold 0 in the LSBs
+);
+
+    localparam int unsigned FOLD_WIDTH = SIMD * ELEM_WIDTH;  // bits per stream beat
+    localparam int unsigned FOLDS_PER_TOKEN = NUM_CHANNELS / SIMD;  // exact only if SIMD divides
+    localparam int unsigned TOTAL_INPUT_FOLDS = NUM_TOKENS * FOLDS_PER_TOKEN;  // PASSTHROUGH beats
+    localparam int unsigned TOTAL_PAD_FOLDS = PAD_TOKENS * FOLDS_PER_TOKEN;  // EMIT_PAD beats
+    localparam int unsigned MAX_PHASE_FOLDS =  // largest beat count among the three phases
+        (TOTAL_INPUT_FOLDS > FOLDS_PER_TOKEN) ?
+            ((TOTAL_INPUT_FOLDS > TOTAL_PAD_FOLDS) ?
+                TOTAL_INPUT_FOLDS : TOTAL_PAD_FOLDS) :
+            ((FOLDS_PER_TOKEN > TOTAL_PAD_FOLDS) ?
+                FOLDS_PER_TOKEN : TOTAL_PAD_FOLDS);
+    localparam int unsigned CNT_WIDTH = (MAX_PHASE_FOLDS <= 1) ? 1 : $clog2(MAX_PHASE_FOLDS);
+
+    typedef enum logic [1:0] {
+        EMIT_CLS,  // drive the CLS token from cls_data
+        PASSTHROUGH,  // forward input beats to the output
+        EMIT_PAD  // drive all-zero padding beats
+    } state_t;
+
+    state_t state;
+    state_t next_state;
+    logic [CNT_WIDTH-1:0] fold_cnt;  // beat index within the current phase
+    logic fold_cnt_last;  // fold_cnt sits on the last beat of the current phase
+    logic out_transfer;  // an output beat is accepted this cycle
+
+    logic [CNT_WIDTH-1:0] cls_fold_cnt;  // fold_cnt clamped to a valid CLS fold index
+    logic [FOLD_WIDTH-1:0] cls_fold;  // CLS fold selected by cls_fold_cnt
+
+    assign cls_fold_cnt = (int'(fold_cnt) < FOLDS_PER_TOKEN) ? fold_cnt : '0;  // clamp index
+    assign cls_fold = cls_data[cls_fold_cnt * FOLD_WIDTH +: FOLD_WIDTH];
+    assign out_transfer = ovld & ordy;
+
+    always_comb begin  // last-beat detect for the active phase
+        unique case (state)
+            EMIT_CLS:    fold_cnt_last = (int'(fold_cnt) == FOLDS_PER_TOKEN - 1);
+            PASSTHROUGH: fold_cnt_last = (int'(fold_cnt) == TOTAL_INPUT_FOLDS - 1);
+            EMIT_PAD:    fold_cnt_last = (int'(fold_cnt) == TOTAL_PAD_FOLDS - 1);
+            default:     fold_cnt_last = 1'b1;
+        endcase
+    end
+
+    always_comb begin  // stream handshake and output data mux
+        irdy = 1'b0;  // defaults: accept no input, offer no output, zero data
+        ovld = 1'b0;
+        odat = '0;
+
+        unique case (state)
+            EMIT_CLS: begin
+                ovld = 1'b1;
+                odat = cls_fold;
+            end
+            PASSTHROUGH: begin
+                irdy = ordy;  // handshake and data are passed through combinationally
+                ovld = ivld;
+                odat = idat;
+            end
+            EMIT_PAD: begin
+                ovld = 1'b1;  // odat keeps its '0 default
+            end
+            default: begin
+            end
+        endcase
+    end
+
+    always_comb begin  // phase order: EMIT_CLS -> PASSTHROUGH -> (EMIT_PAD) -> EMIT_CLS
+        next_state = state;
+        if (out_transfer && fold_cnt_last) begin
+            unique case (state)
+                EMIT_CLS: begin
+                    next_state = PASSTHROUGH;
+                end
+                PASSTHROUGH: begin
+                    next_state = (PAD_TOKENS == 0) ? EMIT_CLS : EMIT_PAD;  // skip empty pad phase
+                end
+                EMIT_PAD: begin
+                    next_state = EMIT_CLS;
+                end
+                default: begin
+                    next_state = EMIT_CLS;
+                end
+            endcase
+        end
+    end
+
+    always_ff @(posedge clk) begin
+        if (rst) begin
+            state <= EMIT_CLS;
+            fold_cnt <= '0;
+        end else if (out_transfer) begin  // advance only on accepted output beats
+            if (fold_cnt_last) begin
+                state <= next_state;
+                fold_cnt <= '0;
+            end else begin
+                fold_cnt <= fold_cnt + 1'b1;
+            end
+        end
+    end
+
+endmodule
diff --git a/finn-rtllib/addclstoken/hdl/addclstoken_template.v b/finn-rtllib/addclstoken/hdl/addclstoken_template.v
new file mode 100644
index 0000000000..57bba51c8d
--- /dev/null
+++ b/finn-rtllib/addclstoken/hdl/addclstoken_template.v
@@ -0,0 +1,81 @@
+/******************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$ #(  // stream wrapper around addclstoken; placeholders filled by codegen
+    parameter FOLD_WIDTH = $FOLD_WIDTH$,
+    parameter AXI_WIDTH = ((FOLD_WIDTH + 7) / 8) * 8  // FOLD_WIDTH rounded up to whole bytes
+)(
+    (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+    (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+    input ap_clk,
+    (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+    input ap_rst_n,
+
+    output in0_V_TREADY,
+    input in0_V_TVALID,
+    input [AXI_WIDTH-1:0] in0_V_TDATA,
+
+    input out_V_TREADY,
+    output out_V_TVALID,
+    output [AXI_WIDTH-1:0] out_V_TDATA
+);
+
+    localparam [$CLS_WIDTH$-1:0] CLS_DATA = $CLS_DATA$;  // packed CLS token constant
+
+    wire [FOLD_WIDTH-1:0] core_out;
+
+    assign out_V_TDATA[FOLD_WIDTH-1:0] = core_out;
+
+    generate
+        if (AXI_WIDTH > FOLD_WIDTH) begin : gen_pad_tdata  // zero the byte-alignment bits
+            assign out_V_TDATA[AXI_WIDTH-1:FOLD_WIDTH] = {(AXI_WIDTH-FOLD_WIDTH){1'b0}};
+        end
+    endgenerate
+
+    addclstoken #(
+        .NUM_TOKENS($NUM_TOKENS$),
+        .NUM_CHANNELS($NUM_CHANNELS$),
+        .SIMD($SIMD$),
+        .ELEM_WIDTH($ELEM_WIDTH$),
+        .PAD_TOKENS($PAD_TOKENS$)
+    ) impl (
+        .clk(ap_clk),
+        .rst(!ap_rst_n),  // wrapper reset is active low, core reset is active high
+        .irdy(in0_V_TREADY),
+        .ivld(in0_V_TVALID),
+        .idat(in0_V_TDATA[FOLD_WIDTH-1:0]),  // bits above FOLD_WIDTH are not used
+        .ordy(out_V_TREADY),
+        .ovld(out_V_TVALID),
+        .odat(core_out),
+        .cls_data(CLS_DATA)
+    );
+
+endmodule
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index ecc1d28c53..8c2f79c1d6 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -348,6 +348,8 @@ def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig):
     model = model.transform(to_hw.InferQuantizedMatrixVectorActivation())
     # TopK to LabelSelect
     model = model.transform(to_hw.InferLabelSelectLayer())
+    # Concat of a constant CLS token and a patch sequence to AddCLSToken
+    model = model.transform(to_hw.InferAddCLSTokenLayer())
     # input quantization (if any) as standalone threshold
     model = model.transform(to_hw.InferThresholdingLayer())
     # needed for convolutions -- TODO always exec?
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..c6e8dd1dcc 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -27,6 +27,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+from finn.custom_op.fpgadataflow.addclstoken import AddCLSToken
 from finn.custom_op.fpgadataflow.addstreams import AddStreams
 from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp
 from finn.custom_op.fpgadataflow.concat import StreamingConcat
@@ -66,6 +67,7 @@
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
 
 custom_op["AddStreams"] = AddStreams
+custom_op["AddCLSToken"] = AddCLSToken
 custom_op["ChannelwiseOp"] = ChannelwiseOp
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
 custom_op["DownSampler"] = DownSampler
diff --git a/src/finn/custom_op/fpgadataflow/addclstoken.py b/src/finn/custom_op/fpgadataflow/addclstoken.py
new file mode 100644
index 0000000000..35eae4bb29
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/addclstoken.py
@@ -0,0 +1,171 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class AddCLSToken(HWCustomOp):
+    """Prepend a learned class token to a sequence of patch tokens."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = super().get_nodeattr_types()
+        my_attrs.update(
+            {
+                "NumTokens": ("i", True, 0),  # number of input (patch) tokens
+                "NumChannels": ("i", True, 0),  # elements per token
+                "PadTokens": ("i", False, 0),  # zero tokens appended after the patches
+                "SIMD": ("i", False, 1),  # elements per stream word; must divide NumChannels
+                "inputDataType": ("s", True, ""),  # DataType name of the patch tokens
+                "outputDataType": ("s", False, ""),  # "" falls back to inputDataType
+            }
+        )
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):  # ind 0: patch tokens, ind 1: CLS token
+        num_channels = self.get_nodeattr("NumChannels")
+        if ind == 0:
+            return (1, self.get_nodeattr("NumTokens"), num_channels)
+        elif ind == 1:
+            return (1, 1, num_channels)
+        else:
+            raise Exception("AddCLSToken only has two inputs")
+
+    def get_folded_input_shape(self, ind=0):  # last dim -> (NumChannels // SIMD, SIMD)
+        normal_shape = self.get_normal_input_shape(ind)
+        simd = self.get_nodeattr("SIMD")
+        num_channels = normal_shape[-1]
+        assert num_channels % simd == 0, "SIMD must divide NumChannels"
+        return normal_shape[:-1] + (num_channels // simd, simd)
+
+    def get_normal_output_shape(self, ind=0):  # token axis: 1 CLS + NumTokens + PadTokens
+        num_tokens = self.get_nodeattr("NumTokens")
+        num_channels = self.get_nodeattr("NumChannels")
+        pad_tokens = self.get_nodeattr("PadTokens")
+        return (1, num_tokens + 1 + pad_tokens, num_channels)
+
+    def get_folded_output_shape(self, ind=0):  # last dim -> (NumChannels // SIMD, SIMD)
+        normal_shape = self.get_normal_output_shape(ind)
+        simd = self.get_nodeattr("SIMD")
+        num_channels = normal_shape[-1]
+        assert num_channels % simd == 0, "SIMD must divide NumChannels"
+        return normal_shape[:-1] + (num_channels // simd, simd)
+
+    def make_shape_compatible_op(self, model):  # asserts both input shapes match the attributes
+        exp_ishape = self.get_normal_input_shape(0)
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for patch tokens."
+
+        exp_wshape = self.get_normal_input_shape(1)
+        wshape = tuple(model.get_tensor_shape(self.onnx_node.input[1]))
+        assert wshape == exp_wshape, "Unexpected input shape for CLS token."
+
+        return super().make_const_shape_op(self.get_normal_output_shape())
+
+    def infer_node_datatype(self, model):  # output and CLS token take the patch datatype
+        node = self.onnx_node
+        attr_idt = None
+        if self.get_nodeattr("inputDataType") != "":
+            attr_idt = self.get_input_datatype()
+
+        idt = model.get_tensor_datatype(node.input[0])  # graph annotation takes priority
+        if idt is None:
+            idt = attr_idt
+        if idt is None:
+            raise Exception("AddCLSToken input datatype is not set")
+
+        if attr_idt is not None and attr_idt != idt:
+            warnings.warn(
+                "inputDataType changing for %s: %s -> %s" % (node.name, str(attr_idt), str(idt))
+            )
+        self.set_nodeattr("inputDataType", idt.name)
+
+        cls_dt = model.get_tensor_datatype(node.input[1])
+        if cls_dt is None:
+            model.set_tensor_datatype(node.input[1], idt)  # unannotated CLS token inherits idt
+        else:
+            assert cls_dt == idt, "CLS token datatype must match input datatype."
+
+        self.set_nodeattr("outputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):  # no checks implemented
+        pass
+
+    def get_input_datatype(self, ind=0):  # same datatype is reported for every input index
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self, ind=0):
+        odt = self.get_nodeattr("outputDataType")
+        if odt == "":
+            return self.get_input_datatype(ind)  # attribute unset: mirror the input datatype
+        return DataType[odt]
+
+    def get_instream_width(self, ind=0):  # stream width in bits
+        if ind != 0:
+            return 0  # inputs other than the patch tokens report zero stream width
+        return self.get_input_datatype().bitwidth() * self.get_nodeattr("SIMD")
+
+    def get_outstream_width(self, ind=0):  # stream width in bits
+        return self.get_output_datatype().bitwidth() * self.get_nodeattr("SIMD")
+
+    def get_number_output_values(self):  # number of folded output words (SIMD dim excluded)
+        return int(np.prod(self.get_folded_output_shape()[:-1]))
+
+    def get_exp_cycles(self):  # expected cycles = number of folded output words
+        return int(np.prod(self.get_folded_output_shape()[:-1]))
+
+    def execute_node(self, context, graph):  # numpy reference: [CLS, patches, zero padding]
+        node = self.onnx_node
+        patches = context[node.input[0]]
+        cls_token = context[node.input[1]]
+
+        result = np.concatenate([cls_token, patches], axis=1)  # CLS token goes first
+        pad_tokens = self.get_nodeattr("PadTokens")
+        if pad_tokens > 0:
+            pad_shape = (1, pad_tokens, self.get_nodeattr("NumChannels"))
+            padding = np.zeros(pad_shape, dtype=result.dtype)
+            result = np.concatenate([result, padding], axis=1)
+
+        oshape = self.get_normal_output_shape()
+        context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
+
+    def bram_estimation(self):
+        return 0
+
+    def lut_estimation(self):
+        return int(128 + self.get_nodeattr("NumChannels"))  # NOTE(review): heuristic; confirm
+
+    def get_op_and_param_counts(self):  # one parameter per CLS token channel
+        return {"param_cls_token": int(self.get_nodeattr("NumChannels"))}
diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
index 06067a4fca..26ed73e382 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
@@ -26,6 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+from finn.custom_op.fpgadataflow.rtl.addclstoken_rtl import AddCLSToken_rtl
 from finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl import (
     ConvolutionInputGenerator_rtl,
 )
@@ -42,6 +43,7 @@
 
 # make sure new HLSCustomOp subclasses are imported here so that they get
 # registered and plug in correctly into the infrastructure
+custom_op["AddCLSToken_rtl"] = AddCLSToken_rtl
 custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
 custom_op["FMPadding_rtl"] = FMPadding_rtl
 custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl
diff --git a/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py
new file mode 100644
index 0000000000..53e6318f49
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py
@@ -0,0 +1,211 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import shutil
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.addclstoken import AddCLSToken
+from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
+from finn.util.basic import get_rtlsim_trace_depth, make_build_dir
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+def _rtlsrc_dir():  # HDL source directory of this op below the FINN_ROOT environment variable
+    return os.environ["FINN_ROOT"] + "/finn-rtllib/addclstoken/hdl"
+
+
+class AddCLSToken_rtl(AddCLSToken, RTLBackend):
+    """RTL implementation of AddCLSToken."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):  # union of the AddCLSToken and RTLBackend attributes
+        my_attrs = {}
+        my_attrs.update(AddCLSToken.get_nodeattr_types(self))
+        my_attrs.update(RTLBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def _pack_value(self, value, dtype):  # one element -> unsigned bitwidth-wide integer
+        bitwidth = dtype.bitwidth()
+        if dtype == DataType["BIPOLAR"]:
+            int_value = int((value + 1) // 2)  # -1 -> 0, +1 -> 1
+        else:
+            if dtype.is_fixed_point():
+                value = value / dtype.scale_factor()  # presumably the raw integer; confirm
+            int_value = int(value)
+            if int_value < 0:
+                int_value += 1 << bitwidth  # two's complement encoding
+        return int_value & ((1 << bitwidth) - 1)
+
+    def _pack_cls_token(self, model):  # CLS initializer -> sized Verilog hex literal string
+        dtype = self.get_input_datatype()
+        bitwidth = dtype.bitwidth()
+        num_channels = self.get_nodeattr("NumChannels")
+        cls_token = model.get_initializer(self.onnx_node.input[1])
+        if cls_token is None:
+            raise Exception("AddCLSToken RTL generation requires a constant CLS token input.")
+
+        cls_token = np.asarray(cls_token, dtype=np.float32)
+        assert cls_token.shape == self.get_normal_input_shape(
+            1
+        ), "CLS token shape does not match AddCLSToken attributes."
+        assert np.vectorize(dtype.allowed)(cls_token).all(), (
+            "CLS token values cannot be represented with %s" % dtype.name
+        )
+        packed = 0
+        for i, value in enumerate(cls_token.flatten()):
+            packed |= self._pack_value(value, dtype) << (i * bitwidth)  # element 0 in the LSBs
+        return "%d'h%x" % (num_channels * bitwidth, packed)
+
+    def generate_hdl(self, model, fpgapart, clk):  # writes <top>.v and copies addclstoken.sv
+        simd = self.get_nodeattr("SIMD")
+        num_channels = self.get_nodeattr("NumChannels")
+        assert num_channels % simd == 0, "SIMD must divide NumChannels"
+
+        rtlsrc = _rtlsrc_dir()
+        template_path = rtlsrc + "/addclstoken_template.v"
+        with open(template_path, "r") as f:
+            template = f.read()
+
+        topname = self.get_verilog_top_module_name()
+        self.set_nodeattr("gen_top_module", topname)
+
+        elem_width = self.get_input_datatype().bitwidth()
+        fold_width = elem_width * simd
+        code_gen_dict = {  # keys are the placeholder names used in the template
+            "TOP_MODULE_NAME": topname,
+            "NUM_TOKENS": self.get_nodeattr("NumTokens"),
+            "NUM_CHANNELS": num_channels,
+            "SIMD": simd,
+            "ELEM_WIDTH": elem_width,
+            "PAD_TOKENS": self.get_nodeattr("PadTokens"),
+            "FOLD_WIDTH": fold_width,
+            "CLS_WIDTH": num_channels * elem_width,
+            "CLS_DATA": self._pack_cls_token(model),
+        }
+
+        for key, value in code_gen_dict.items():
+            template = template.replace("$%s$" % key, str(value))  # fill $KEY$ placeholders
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(os.path.join(code_gen_dir, topname + ".v"), "w") as f:
+            f.write(template)
+        shutil.copy(rtlsrc + "/addclstoken.sv", code_gen_dir)
+
+        self.set_nodeattr("ipgen_path", code_gen_dir)  # both paths point at the codegen dir
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):  # builds a PyVerilator model of the generated top module
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        verilog_files = [
+            "addclstoken.sv",
+            self.get_nodeattr("gen_top_module") + ".v",
+        ]
+        sim = PyVerilator.build(
+            verilog_files,
+            build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"),
+            verilog_path=[code_gen_dir],
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_nodeattr("gen_top_module"),
+        )
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def code_generation_ipi(self):  # Vivado commands: add sources, instantiate the top module
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        sourcefiles = [
+            "addclstoken.sv",
+            self.get_nodeattr("gen_top_module") + ".v",
+        ]
+        sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles]
+
+        cmd = []
+        for f in sourcefiles:
+            cmd += ["add_files -norecurse %s" % f]
+        cmd += [
+            "create_bd_cell -type module -reference %s %s"
+            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
+        ]
+        return cmd
+
+    def execute_node(self, context, graph):  # dispatches on the exec_mode attribute
+        mode = self.get_nodeattr("exec_mode")
+        if mode == "cppsim":
+            AddCLSToken.execute_node(self, context, graph)  # falls back to the numpy version
+        elif mode == "rtlsim":
+            node = self.onnx_node
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            exp_ishape = self.get_normal_input_shape(0)
+            exp_oshape = self.get_normal_output_shape()
+
+            inp = context[node.input[0]]
+            assert str(inp.dtype) == "float32", "Input datatype is not float32"
+            assert inp.shape == exp_ishape, "Input shape does not match expected shape."
+
+            folded_ishape = self.get_folded_input_shape(0)  # input is staged on disk as .npy
+            np.save(os.path.join(code_gen_dir, "input_0.npy"), inp.reshape(folded_ishape).copy())
+
+            sim = self.get_rtlsim()
+            export_idt = self.get_input_datatype()
+            rtlsim_inp = npy_to_rtlsim_input(
+                os.path.join(code_gen_dir, "input_0.npy"),
+                export_idt,
+                self.get_instream_width(),
+            )
+            self.reset_rtlsim(sim)
+            self.toggle_clk(sim)
+            rtlsim_output = self.rtlsim(sim, rtlsim_inp)  # only input 0 is driven into the sim
+
+            odt = self.get_output_datatype()
+            out_npy = rtlsim_output_to_npy(
+                rtlsim_output,
+                os.path.join(code_gen_dir, "output.npy"),
+                odt,
+                self.get_folded_output_shape(),
+                self.get_outstream_width(),
+                odt.bitwidth(),
+            )
+            context[node.output[0]] = np.asarray(out_npy, dtype=np.float32).reshape(exp_oshape)
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following values ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index e14181b140..e486b19ce4 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1196,6 +1196,85 @@ def apply(self, model):
         return (model, graph_modified)
 
 
+class InferAddCLSTokenLayer(Transformation):
+    """Replace Concat([cls_initializer, patches], axis=1) on rank-3 inputs with AddCLSToken."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type != "Concat":
+                continue
+            # Only a two-input Concat carrying an explicit axis attribute is considered.
+            axis = get_by_name(node.attribute, "axis")
+            if axis is None or len(node.input) != 2:
+                continue
+            # Input 0 must be an initializer (the CLS token), input 1 must not be one.
+            cls_name = node.input[0]
+            patch_name = node.input[1]
+            cls_init = model.get_initializer(cls_name)
+            if cls_init is None or model.get_initializer(patch_name) is not None:
+                continue
+            # Use the initializer's own shape when the model has no shape for the CLS tensor.
+            cls_shape = model.get_tensor_shape(cls_name)
+            if cls_shape is None:
+                cls_shape = list(cls_init.shape)
+            patch_shape = model.get_tensor_shape(patch_name)
+            if cls_shape is None or patch_shape is None:
+                continue
+            if any(x is None for x in list(cls_shape) + list(patch_shape)):
+                continue
+            # Map a negative axis to its positive index; require rank 3 joined along axis 1.
+            rank = len(patch_shape)
+            concat_axis = axis.i if axis.i >= 0 else axis.i + rank
+            if rank != 3 or concat_axis != 1:
+                continue
+            # CLS must be [1, 1, C]; patches must be [1, N, C] with the same C.
+            if len(cls_shape) != 3 or cls_shape[0] != 1 or cls_shape[1] != 1:
+                continue
+            if patch_shape[0] != 1 or cls_shape[2] != patch_shape[2]:
+                continue
+            # A known output shape must equal [1, N + 1, C].
+            out_shape = model.get_tensor_shape(node.output[0])
+            exp_oshape = [1, patch_shape[1] + 1, patch_shape[2]]
+            if out_shape is not None and list(out_shape) != exp_oshape:
+                continue
+            # Patches need an integer datatype, which the CLS tensor adopts or must equal.
+            idt = model.get_tensor_datatype(patch_name)
+            if idt is None or not idt.is_integer():
+                continue
+            cls_dt = model.get_tensor_datatype(cls_name)
+            if cls_dt is None:  # NOTE(review): confirm the getter can return None at all
+                model.set_tensor_datatype(cls_name, idt)
+            elif cls_dt != idt:
+                continue
+            # The replacement node takes [patches, cls], i.e. the Concat inputs swapped.
+            new_node = helper.make_node(
+                "AddCLSToken",
+                [patch_name, cls_name],
+                node.output,
+                domain="finn.custom_op.fpgadataflow",
+                backend="fpgadataflow",
+                name="AddCLSToken_" + node.name,
+                NumTokens=int(patch_shape[1]),
+                NumChannels=int(patch_shape[2]),
+                PadTokens=0,
+                SIMD=1,
+                inputDataType=idt.name,
+                outputDataType=idt.name,
+            )
+            graph.node.insert(node_ind, new_node)
+            graph.node.remove(node)
+            graph_modified = True
+        # Shapes and datatypes are re-inferred once, after all replacements.
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
+
+
 class InferStreamingEltwise(Transformation):
     """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer
     with SubEltwise or AbsDiffEltwise op."""
diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index dbcadd1df5..ac26028106 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -311,6 +311,7 @@ def apply(self, model):
                 node.input,
                 node.output,
                 domain="finn.custom_op.fpgadataflow." + impl_style,
+                name=node.name,
             )
             # add all attributes
             for attribute in node.attribute:
diff --git a/src/finn/util/vivado.py b/src/finn/util/vivado.py
index bc8ca40d88..14cdba54df 100644
--- a/src/finn/util/vivado.py
+++ b/src/finn/util/vivado.py
@@ -27,10 +27,27 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import re
 
 from finn.util.basic import launch_process_helper, which
 
 
+def _extract_util_from_report(vivado_proj_folder, row_name):
+    """Return the first number following row_name in a vivado.log table row, else None."""
+    log_path = os.path.join(vivado_proj_folder, "vivado.log")
+    if not os.path.isfile(log_path):
+        return None
+    # Accept only a well-formed decimal so that float() below cannot raise on e.g. "1.2.3".
+    number = r"([0-9]+(?:\.[0-9]+)?)"
+    row_pattern = re.compile(r"^\|\s*%s\s*\|\s*%s\s*\|" % (re.escape(row_name), number))
+    with open(log_path, "r", errors="replace") as f:  # tolerate undecodable log bytes
+        for line in f:
+            match = row_pattern.match(line)
+            if match is not None:
+                return float(match.group(1))
+    return None
+
+
 def out_of_context_synth(
     verilog_dir,
     top_name,
@@ -48,16 +65,17 @@ def out_of_context_synth(
         raise Exception("vivado is not in PATH, ensure settings64.sh is sourced.")
     omx_path = os.environ["OHMYXILINX"]
     script = "vivadocompile.sh"
-    # vivadocompile.sh <top-level-entity> <clock-name (optional)> <fpga-part (optional)>
-    call_omx = "zsh %s/%s %s %s %s %f" % (
-        omx_path,
-        script,
+    # vivadocompile.sh <top-level-entity> <fp-ip-tcl-list> <clock-name>
+    #                  <fpga-part> <clk-period-ns>
+    call_omx = [
+        "zsh",
+        os.path.join(omx_path, script),
         top_name,
+        "",
         clk_name,
         fpga_part,
-        float(clk_period_ns),
-    )
-    call_omx = call_omx.split()
+        "%f" % float(clk_period_ns),
+    ]
     launch_process_helper(call_omx, proc_env=os.environ.copy(), cwd=verilog_dir)
 
     vivado_proj_folder = "%s/results_%s" % (verilog_dir, top_name)
@@ -67,13 +85,23 @@ def out_of_context_synth(
         res_data = myfile.read().split("\n")
     ret = {}
     ret["vivado_proj_folder"] = vivado_proj_folder
+    util_report_rows = {
+        "DSP": "DSPs",
+    }
     for res_line in res_data:
         res_fields = res_line.split("=")
         print(res_fields)
         try:
             ret[res_fields[0]] = float(res_fields[1])
         except ValueError:
-            ret[res_fields[0]] = 0
+            util_value = None
+            if res_fields[0] in util_report_rows:
+                util_value = _extract_util_from_report(
+                    vivado_proj_folder, util_report_rows[res_fields[0]]
+                )
+            if util_value is None:
+                raise
+            ret[res_fields[0]] = util_value
         except IndexError:
             ret[res_fields[0]] = 0
     if ret["WNS"] == 0:
diff --git a/tests/fpgadataflow/test_fpgadataflow_addclstoken.py b/tests/fpgadataflow/test_fpgadataflow_addclstoken.py
new file mode 100644
index 0000000000..07caadc99f
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_addclstoken.py
@@ -0,0 +1,299 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import os
+from functools import partial
+from onnx import TensorProto, helper, numpy_helper
+from pathlib import Path
+from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.analysis.fpgadataflow.res_estimation import (
+    res_estimation,
+    res_estimation_complete,
+)
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.convert_to_hw_layers import InferAddCLSTokenLayer
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
+
+FPGA_PART = "xc7z020clg400-1"
+CLK_NS = 10
+
+
+def _make_graph(nodes, output_shape, cls_values, finn_dtype=DataType["INT8"]):
+    """Wrap nodes in a model with a [1, 3, 4] "patches" input and a "cls" initializer."""
+    graph_in = helper.make_tensor_value_info("patches", TensorProto.FLOAT, [1, 3, 4])
+    graph_out = helper.make_tensor_value_info("out", TensorProto.FLOAT, output_shape)
+    cls_tensor = numpy_helper.from_array(cls_values.astype(np.float32), name="cls")
+    graph = helper.make_graph(nodes, "addclstoken_test", [graph_in], [graph_out], [cls_tensor])
+    onnx_model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])
+    wrapped = ModelWrapper(onnx_model)
+    for annotated in ("patches", "cls", "out"):
+        wrapped.set_tensor_datatype(annotated, finn_dtype)
+    return wrapped
+
+
+def _make_concat_model():
+    """Build the reference model out = Concat(cls, patches) along axis 1.
+
+    Returns the wrapped model together with the CLS initializer values.
+    """
+    cls_token = np.asarray([[[1, -2, 3, -4]]], dtype=np.float32)
+    # The CLS initializer is the first Concat input, the patches graph input the second.
+    concat_op = helper.make_node("Concat", ["cls", "patches"], ["out"], axis=1, name="concat_cls")
+    # One CLS token in front of three patch tokens gives four output tokens.
+    wrapped = _make_graph([concat_op], [1, 4, 4], cls_token)
+    return wrapped, cls_token
+
+
+def _make_addclstoken_model(pad_tokens=0, simd=1, finn_dtype=DataType["INT8"], cls_values=None):
+    """Build a one-node AddCLSToken model with 3 patch tokens of 4 channels each.
+
+    Returns (model, cls_values); a fixed default token is used if cls_values is None.
+    """
+    if cls_values is None:
+        cls_values = np.asarray([[[1, -2, 3, -4]]], dtype=np.float32)
+    # The output holds the CLS token, the 3 patch tokens and pad_tokens further tokens.
+    out_shape = [1, 4 + pad_tokens, 4]
+    addcls_node = helper.make_node(
+        "AddCLSToken",
+        inputs=["patches", "cls"],
+        outputs=["out"],
+        name="AddCLSToken_0",
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        NumTokens=3,
+        NumChannels=4,
+        PadTokens=pad_tokens,
+        SIMD=simd,
+        inputDataType=finn_dtype.name,
+        outputDataType=finn_dtype.name,
+    )
+    return _make_graph([addcls_node], out_shape, cls_values, finn_dtype), cls_values
+
+
+def _prepare_addclstoken_stitched_ip_model(simd=1, pad_tokens=0):  # returns (model, cls_values)
+    model, cls_values = _make_addclstoken_model(pad_tokens=pad_tokens, simd=simd)
+    model = model.transform(SpecializeLayers(FPGA_PART))
+    model = model.transform(InsertFIFO(create_shallow_fifos=True))
+    model = model.transform(SpecializeLayers(FPGA_PART))  # presumably for the inserted FIFOs
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(FPGA_PART, CLK_NS))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(FPGA_PART, CLK_NS, vitis=False))  # last step
+    return model, cls_values
+
+
+@pytest.mark.fpgadataflow
+def test_convert_concat_to_addclstoken():
+    model, cls_values = _make_concat_model()
+    patches = np.arange(12, dtype=np.float32).reshape(1, 3, 4)
+    expected = np.concatenate([cls_values, patches], axis=1)
+
+    ret = execute_onnx(model, {"patches": patches})
+    assert (ret["out"] == expected).all()
+
+    model = model.transform(InferAddCLSTokenLayer())
+    node = model.graph.node[0]
+    assert node.op_type == "AddCLSToken"
+    assert node.domain == "finn.custom_op.fpgadataflow"
+    assert list(node.input) == ["patches", "cls"]
+    # The converted node must report a (1, 4, 4) output and 16 expected cycles.
+    inst = getCustomOp(node)
+    assert inst.get_normal_output_shape() == (1, 4, 4)
+    assert inst.get_exp_cycles() == 16
+
+    ret = execute_onnx(model, {"patches": patches})
+    assert (ret["out"] == expected).all()
+
+    model = model.transform(SpecializeLayers(FPGA_PART))
+    assert model.graph.node[0].op_type == "AddCLSToken_rtl"
+    assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.rtl"
+    assert model.graph.node[0].name == "AddCLSToken_concat_cls"
+
+
+@pytest.mark.fpgadataflow
+def test_addclstoken_python_execution_with_padding():
+    model, cls_values = _make_addclstoken_model(pad_tokens=2)  # two padding tokens requested
+    patches = np.arange(12, dtype=np.float32).reshape(1, 3, 4)
+    expected = np.concatenate(
+        [cls_values, patches, np.zeros((1, 2, 4), dtype=np.float32)],  # CLS, patches, zeros
+        axis=1,
+    )
+
+    ret = execute_onnx(model, {"patches": patches})
+    assert (ret["out"] == expected).all()
+
+
+@pytest.mark.fpgadataflow
+@pytest.mark.parametrize(
+    "finn_dtype,cls_values,expected_cls_data",
+    [
+        (DataType["INT8"], np.asarray([[[1, -2, 3, -4]]], dtype=np.float32), "32'hfc03fe01"),
+        (DataType["UINT4"], np.asarray([[[1, 2, 3, 4]]], dtype=np.float32), "16'h4321"),
+        (DataType["BIPOLAR"], np.asarray([[[1, -1, 1, -1]]], dtype=np.float32), "4'h5"),
+    ],
+)
+def test_addclstoken_rtl_codegen(tmp_path, monkeypatch, finn_dtype, cls_values, expected_cls_data):
+    if "FINN_ROOT" not in os.environ:  # default to the repository root
+        monkeypatch.setenv("FINN_ROOT", str(Path(__file__).resolve().parents[2]))
+    # pad_tokens=1 and simd=2 are checked below via .PAD_TOKENS(1) and .SIMD(2).
+    model, _ = _make_addclstoken_model(
+        pad_tokens=1,
+        simd=2,
+        finn_dtype=finn_dtype,
+        cls_values=cls_values,
+    )
+    model = model.transform(SpecializeLayers(FPGA_PART))
+
+    node = model.graph.node[0]
+    inst = getCustomOp(node)
+    inst.set_nodeattr("code_gen_dir_ipgen", str(tmp_path))
+    inst.code_generation_ipgen(model, FPGA_PART, CLK_NS)
+
+    topname = inst.get_nodeattr("gen_top_module")
+    assert topname == "AddCLSToken_0"
+    wrapper = tmp_path / (topname + ".v")
+    core = tmp_path / "addclstoken.sv"
+    assert wrapper.is_file()
+    assert core.is_file()
+    wrapper_text = wrapper.read_text()
+    assert "parameter FOLD_WIDTH = %d" % (2 * finn_dtype.bitwidth()) in wrapper_text
+    assert ".SIMD(2)" in wrapper_text
+    assert ".PAD_TOKENS(1)" in wrapper_text
+    assert "CLS_DATA = %s" % expected_cls_data in wrapper_text
+    assert "= '0" not in wrapper_text
+    # The IPI commands must reference the core source and instantiate the top module.
+    ipi_cmds = inst.code_generation_ipi()
+    assert any("addclstoken.sv" in cmd for cmd in ipi_cmds)
+    assert any("create_bd_cell" in cmd and topname in cmd for cmd in ipi_cmds)
+
+
+@pytest.mark.fpgadataflow
+def test_addclstoken_resource_estimation():
+    """Both resource estimation analyses must report the expected figures for the node."""
+    model = _make_addclstoken_model(pad_tokens=1, simd=2)[0]
+    model = model.transform(SpecializeLayers(FPGA_PART)).transform(GiveUniqueNodeNames())
+    expected = dict(
+        BRAM_18K=0,
+        BRAM_efficiency=1,
+        LUT=132,
+        URAM=0,
+        URAM_efficiency=1,
+        DSP=0,
+    )
+    # Exactly one entry is expected, holding the estimate dict.
+    per_node = model.analysis(partial(res_estimation, fpgapart=FPGA_PART))
+    assert len(per_node) == 1
+    assert next(iter(per_node.values())) == expected
+    # The complete variant wraps the same estimate in a list.
+    per_node_complete = model.analysis(partial(res_estimation_complete, fpgapart=FPGA_PART))
+    assert len(per_node_complete) == 1
+    assert next(iter(per_node_complete.values())) == [expected]
+
+
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+@pytest.mark.parametrize("simd,pad_tokens", [(1, 0), (2, 1)])
+def test_addclstoken_rtlsim(simd, pad_tokens):
+    """Node-level RTL simulation must reproduce the reference output.
+
+    The simulated cycle count is additionally compared with the analytical estimate.
+    """
+    model, cls_values = _make_addclstoken_model(pad_tokens=pad_tokens, simd=simd)
+    patches = np.arange(12, dtype=np.float32).reshape(1, 3, 4)
+    padding = np.zeros((1, pad_tokens, 4), dtype=np.float32)  # empty for pad_tokens == 0
+    expected = np.concatenate([cls_values, patches, padding], axis=1)
+
+    model = model.transform(SpecializeLayers(FPGA_PART))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(FPGA_PART, CLK_NS))
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareRTLSim())
+
+    ret = execute_onnx(model, {"patches": patches})
+    assert (ret["out"] == expected).all()
+
+    node = model.get_nodes_by_op_type("AddCLSToken_rtl")[0]
+    cycles_rtlsim = getCustomOp(node).get_nodeattr("cycles_rtlsim")
+    exp_cycles = model.analysis(exp_cycles_per_layer)[node.name]
+    assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+    assert exp_cycles != 0
+
+
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+@pytest.mark.parametrize("simd,pad_tokens", [(1, 0), (2, 1)])
+def test_addclstoken_stitched_ip_rtlsim(simd, pad_tokens):
+    """Check the stitched IP in RTL simulation.
+
+    The output must be the CLS token, the patch tokens and pad_tokens zero tokens.
+    """
+    model, cls_values = _prepare_addclstoken_stitched_ip_model(simd=simd, pad_tokens=pad_tokens)
+    patches = np.arange(12, dtype=np.float32).reshape(1, 3, 4)
+    # Reference output; a zero-length padding block is a no-op for pad_tokens == 0.
+    padding = np.zeros((1, pad_tokens, 4), dtype=np.float32)
+    expected = np.concatenate([cls_values, patches, padding], axis=1)
+
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("extra_verilator_args", str(["-Wno-TIMESCALEMOD"]))
+
+    ret = execute_onnx(model, {"patches": patches})
+    assert (ret["out"] == expected).all()
+
+
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_addclstoken_stitched_ip_synth_ooc():
+    model, _ = _prepare_addclstoken_stitched_ip_model(simd=2, pad_tokens=1)
+    model = model.transform(SynthOutOfContext(FPGA_PART, CLK_NS))
+    ret = model.get_metadata_prop("res_total_ooc_synth")
+    assert ret is not None
+    ret = eval(ret)  # NOTE(review): evaluates the stored string; ast.literal_eval may suffice
+    # Expect some LUTs and FFs, no DSP or BRAM, and a non-negative WNS value.
+    assert ret["LUT"] > 0
+    assert ret["FF"] > 0
+    assert ret["DSP"] == 0
+    assert ret["BRAM"] == 0
+    assert ret["WNS"] >= 0

From a3eac3899098d61f2a6ec08019f468ed7c2d8a7a Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Wed, 29 Apr 2026 09:57:21 +0100
Subject: [PATCH 2/3] header

---
 finn-rtllib/addclstoken/hdl/addclstoken.sv | 37 ++++++++--------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/finn-rtllib/addclstoken/hdl/addclstoken.sv b/finn-rtllib/addclstoken/hdl/addclstoken.sv
index 768b2a9a06..d5bbdc2188 100644
--- a/finn-rtllib/addclstoken/hdl/addclstoken.sv
+++ b/finn-rtllib/addclstoken/hdl/addclstoken.sv
@@ -1,33 +1,22 @@
-/******************************************************************************
+/****************************************************************************
  * Copyright (C) 2026, Advanced Micro Devices, Inc.
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
+ * SPDX-License-Identifier: BSD-3-Clause
  *
- *  1. Redistributions of source code must retain the above copyright notice,
- *     this list of conditions and the following disclaimer.
+ * @brief	Insert a constant class token into a folded token stream.
+ * @author	Oliver Cassidy <oliver.cassidy@amd.com>
  *
- *  2. Redistributions in binary form must reproduce the above copyright
- *     notice, this list of conditions and the following disclaimer in the
- *     documentation and/or other materials provided with the distribution.
+ * @description
+ *	Prepends a learned class token, supplied through cls_data, to each
+ *	input sequence of patch tokens. The class token and patch tokens are
+ *	transferred as SIMD-wide folds of ELEM_WIDTH-bit elements.
  *
- *  3. Neither the name of the copyright holder nor the names of its
- *     contributors may be used to endorse or promote products derived from
- *     this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
- * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
- * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
+ *	Per sequence, the output stream is:
+ *	  1. NUM_CHANNELS/SIMD folds from cls_data
+ *	  2. NUM_TOKENS pass-through input tokens
+ *	  3. PAD_TOKENS zero-valued tokens, when padding is enabled
+ ***************************************************************************/
 
 module addclstoken #(
     parameter int unsigned NUM_TOKENS = 196,

From 658149983145fbbe53ccc8de6bf6c7846806d4eb Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Thu, 30 Apr 2026 16:58:07 +0100
Subject: [PATCH 3/3] Address AddCLSToken review comments

---
 docs/finn/components/rtl-swg.rst              |  2 +-
 docs/finn/developers.rst                      |  2 +-
 docs/finn/source_code/finn.builder.rst        |  2 +-
 docs/finn/source_code/finn.core.rst           |  2 +-
 docs/finn/source_code/finn.rst                |  2 +-
 .../fpgadataflow/rtl/addclstoken_rtl.py       |  4 ++++
 src/finn/custom_op/fpgadataflow/rtlbackend.py | 10 +++++++---
 .../fpgadataflow/specialize_layers.py         |  1 -
 .../test_fpgadataflow_addclstoken.py          | 19 ++++++++++++-------
 9 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/docs/finn/components/rtl-swg.rst b/docs/finn/components/rtl-swg.rst
index e8db1d2fa7..8d48dc9d5a 100644
--- a/docs/finn/components/rtl-swg.rst
+++ b/docs/finn/components/rtl-swg.rst
@@ -96,7 +96,7 @@ Dynamic Mode
 The "default" style also supports a dynamic mode, which provides an interface to change feature map dimensions, stride, or dilation at run-time. See `this pull request <https://github.com/Xilinx/finn/pull/688>`_ for more information.
 
 Folding
--------
+=======
 
 The RTL SWG is supported by the basic automatic folding algorithm in FINN (:py:mod:`finn.transformation.fpgadataflow.set_folding.SetFolding`). Consider the following implications:
 
diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst
index 985b86b279..a265c699c9 100644
--- a/docs/finn/developers.rst
+++ b/docs/finn/developers.rst
@@ -99,7 +99,7 @@ computer, and you should be able to launch the various .tcl scripts or .xpr proj
 Docker container as well.
 
 Linting
--------
+=======
 
 We use a pre-commit hook to auto-format Python code and check for issues.
 See https://pre-commit.com/ for installation. Once you have pre-commit, you can install
diff --git a/docs/finn/source_code/finn.builder.rst b/docs/finn/source_code/finn.builder.rst
index e4dc810e81..caadf3f91f 100644
--- a/docs/finn/source_code/finn.builder.rst
+++ b/docs/finn/source_code/finn.builder.rst
@@ -3,7 +3,7 @@ Builder
 *******
 
 Modules
-~~~~~~~
+=======
 
 finn.builder.build\_dataflow
 ----------------------------
diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst
index 4f16b3ac74..28cb47eaf7 100644
--- a/docs/finn/source_code/finn.core.rst
+++ b/docs/finn/source_code/finn.core.rst
@@ -3,7 +3,7 @@ Core
 ****
 
 Modules
-~~~~~~~
+=======
 
 qonnx.core.data\_layout
 -------------------------
diff --git a/docs/finn/source_code/finn.rst b/docs/finn/source_code/finn.rst
index f67dd0fe9c..5547a46623 100644
--- a/docs/finn/source_code/finn.rst
+++ b/docs/finn/source_code/finn.rst
@@ -6,7 +6,7 @@ The FINN sources are divided into different modules. They are listed below.
 .. note:: **Some of these functions and modules are located in the `qonnx` repository.**
 
 Modules
-~~~~~~~
+=======
 
 .. toctree::
    :maxdepth: 1
diff --git a/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py
index 7b3f810cad..8ca3daec88 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/addclstoken_rtl.py
@@ -135,6 +135,10 @@ def get_rtl_file_list(self, abspath=False):
         ]
         return verilog_files
 
+    def get_rtlsim_input_indices(self):
+        """Return [0]: only input 0 (patch tokens) is streamed; CLS data is embedded in the RTL."""
+        return [0]
+
     def code_generation_ipi(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         sourcefiles = self.get_rtl_file_list()
diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py
index 642523f2db..2b8db0310e 100644
--- a/src/finn/custom_op/fpgadataflow/rtlbackend.py
+++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py
@@ -85,6 +85,10 @@ def code_generation_ipi(self):
     def code_generation_ipgen(self, model, fpgapart, clk):
         self.generate_hdl(model, fpgapart, clk)
 
+    def get_rtlsim_input_indices(self):
+        """Return the ONNX input indices whose stream width is non-zero (the RTLSim streams)."""
+        return [i for i in range(len(self.onnx_node.input)) if self.get_instream_width(i) > 0]
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
@@ -92,10 +96,10 @@ def execute_node(self, context, graph):
         if mode == "rtlsim":
             node = self.onnx_node
             inputs = {}
-            for i, inp in enumerate(node.input):
+            for i in self.get_rtlsim_input_indices():
+                inp = node.input[i]
                 nbits = self.get_instream_width(i)
-                if nbits == 0:
-                    continue
+                assert nbits > 0, "RTLSim input stream %d has zero width." % i
                 exp_ishape = tuple(self.get_normal_input_shape(i))
                 folded_ishape = self.get_folded_input_shape(i)
                 inp_val = context[inp]
diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index b2a8629789..dcd2472e0a 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -389,7 +389,6 @@ def apply(self, model):
                 node.input,
                 node.output,
                 domain="finn.custom_op.fpgadataflow." + impl_style,
-                name=node.name,
             )
             # add all attributes
             for attribute in node.attribute:
diff --git a/tests/fpgadataflow/test_fpgadataflow_addclstoken.py b/tests/fpgadataflow/test_fpgadataflow_addclstoken.py
index 766d783271..7e57c3ef0e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_addclstoken.py
+++ b/tests/fpgadataflow/test_fpgadataflow_addclstoken.py
@@ -120,13 +120,17 @@ def _prepare_addclstoken_stitched_ip_model(simd=1, pad_tokens=0):
     return model, cls_values
 
 
+def _make_input_dict(model, patches):  # maps the first graph input's name to patches
+    return {model.graph.input[0].name: patches}
+
+
 @pytest.mark.fpgadataflow
 def test_convert_concat_to_addclstoken():
     model, cls_values = _make_concat_model()
     patches = np.arange(12, dtype=np.float32).reshape(1, 3, 4)
     expected = np.concatenate([cls_values, patches], axis=1)
 
-    ret = execute_onnx(model, {"patches": patches})
+    ret = execute_onnx(model, _make_input_dict(model, patches))
     assert (ret["out"] == expected).all()
 
     model = model.transform(InferAddCLSTokenLayer())
@@ -139,13 +143,13 @@ def test_convert_concat_to_addclstoken():
     assert inst.get_normal_output_shape() == (1, 4, 4)
     assert inst.get_exp_cycles() == 16
 
-    ret = execute_onnx(model, {"patches": patches})
+    ret = execute_onnx(model, _make_input_dict(model, patches))
     assert (ret["out"] == expected).all()
 
     model = model.transform(SpecializeLayers(FPGA_PART))
+    model = model.transform(GiveUniqueNodeNames())
     assert model.graph.node[0].op_type == "AddCLSToken_rtl"
     assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.rtl"
-    assert model.graph.node[0].name == "AddCLSToken_concat_cls"
 
 
 @pytest.mark.fpgadataflow
@@ -157,7 +161,7 @@ def test_addclstoken_python_execution_with_padding():
         axis=1,
     )
 
-    ret = execute_onnx(model, {"patches": patches})
+    ret = execute_onnx(model, _make_input_dict(model, patches))
     assert (ret["out"] == expected).all()
 
 
@@ -178,6 +182,7 @@ def test_addclstoken_rtl_codegen(tmp_path, finn_dtype, cls_values, expected_cls_
         cls_values=cls_values,
     )
     model = model.transform(SpecializeLayers(FPGA_PART))
+    model = model.transform(GiveUniqueNodeNames())
 
     node = model.graph.node[0]
     inst = getCustomOp(node)
@@ -185,7 +190,7 @@ def test_addclstoken_rtl_codegen(tmp_path, finn_dtype, cls_values, expected_cls_
     inst.code_generation_ipgen(model, FPGA_PART, CLK_NS)
 
     topname = inst.get_nodeattr("gen_top_module")
-    assert topname == "AddCLSToken_0"
+    assert topname == node.name
     wrapper = tmp_path / (topname + ".v")
     core = tmp_path / "addclstoken.sv"
     assert wrapper.is_file()
@@ -244,7 +249,7 @@ def test_addclstoken_rtlsim(simd, pad_tokens):
     model = model.transform(SetExecMode("rtlsim"))
     model = model.transform(PrepareRTLSim())
 
-    ret = execute_onnx(model, {"patches": patches})
+    ret = execute_onnx(model, _make_input_dict(model, patches))
     assert (ret["out"] == expected).all()
 
     node = model.get_nodes_by_op_type("AddCLSToken_rtl")[0]
@@ -273,7 +278,7 @@ def test_addclstoken_stitched_ip_rtlsim(simd, pad_tokens):
 
     model.set_metadata_prop("exec_mode", "rtlsim")
 
-    ret = execute_onnx(model, {"patches": patches})
+    ret = execute_onnx(model, _make_input_dict(model, patches))
     assert (ret["out"] == expected).all()