diff --git a/docs/finn/components/index.rst b/docs/finn/components/index.rst index 9ab59297b1..7c8cdf1840 100644 --- a/docs/finn/components/index.rst +++ b/docs/finn/components/index.rst @@ -10,3 +10,4 @@ This section provides detailed documentation for specific FINN hardware componen :maxdepth: 2 rtl-swg + pwpolyf diff --git a/docs/finn/components/pwpolyf.rst b/docs/finn/components/pwpolyf.rst new file mode 100644 index 0000000000..0259f35450 --- /dev/null +++ b/docs/finn/components/pwpolyf.rst @@ -0,0 +1,272 @@ +PWPolyF Piecewise Polynomial Activation +======================================= + +Overview +-------- + +PWPolyF is a hardware activation layer that approximates nonlinear functions +(GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated with +Horner's method on a chain of DSPFP32 FMA units. With the default degree of 2, +this uses two cascaded DSPs and one RAMB18 coefficient ROM per PE, giving +single-cycle-per-element throughput. Per-function configuration, including +clamping behaviour and polynomial coefficients, is delivered through a +SystemVerilog package (``pwpolyf_pkg``) using a ``func_cfg_t`` struct. + +The input domain is partitioned into ``1 + 2*5*(2^K)`` segments: one near-zero +region, positive octave sub-segments, and negative mirrors. With the default +``K=3`` this gives 81 segments. Segment selection reuses the FP32 exponent and +mantissa bit fields directly, matching the RTL implementation. + +Polynomial coefficients are generated at HDL build time by +``PWPolyF_rtl._generate_coeffs_pkg()``, which fits polynomials of the +configured degree to the reference PyTorch functions and writes +``pwpolyf_pkg.sv``. Both ``K`` and ``degree`` are configurable. They default to +``K=3`` and ``degree=2`` when inferred from standard ONNX ops. + +Architecture +------------ + +PWPolyF is RTL-only, with no HLS variant, and targets Versal devices only. 
The +RTL instantiates the Versal DSPFP32 primitive, so UltraScale+ and older parts +must not be specialized to this backend. + +Two export paths are supported: + +.. code-block:: text + + Path A: PiecewisePolyActivation Path B: nn.GELU / nn.SiLU / etc. + | torch.onnx.export | torch.onnx.export + | (dynamo=False) | (dynamo=True or False) + v v + PWPolyF custom ONNX node Standard ONNX ops (Gelu, Sigmoid, + | Tanh, Sigmoid+Mul for SiLU, + | Div+Erf+Add+Mul+Mul for GELU) + | | + +------------- both paths -------------+ + | + InferPWPolyFLayer + v + PWPolyF HW op (finn.custom_op.fpgadataflow) + | SpecializeLayers + v + PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl) + | generate_hdl + v + finn-rtllib/pwpolyf/hdl/ SystemVerilog IP + +Standard ONNX Op Inference +-------------------------- + +``InferPWPolyFLayer`` recognises standard ONNX activation ops in addition to +the explicit ``PWPolyF`` custom op. This allows models that use ``nn.GELU``, +``nn.SiLU``, ``nn.Sigmoid``, or ``nn.Tanh`` to be exported with ``dynamo=True`` +or ``dynamo=False`` and automatically converted to PWPolyF HW layers. + +.. list-table:: + :header-rows: 1 + :widths: 20 45 20 + + * - ONNX op type + - Pattern + - Maps to + * - ``Gelu`` (opset 20+) + - Single node + - ``func="gelu"`` + * - ``Div`` + ``Erf`` + ``Add`` + ``Mul`` + ``Mul`` + - ``x * 0.5 * (1 + erf(x / sqrt(2)))`` + - ``func="gelu"`` + * - ``Sigmoid`` + - Single node (standalone) + - ``func="sigmoid"`` + * - ``Tanh`` + - Single node + - ``func="tanh"`` + * - ``Sigmoid`` + ``Mul`` + - ``Mul(x, Sigmoid(x))`` + - ``func="silu"`` + +``Gelu`` as a single ONNX node requires opset 20 or later. With lower opsets, +including the opset 18 default of ``dynamo=True`` export, GELU decomposes into +a 5-node Erf-based pattern. Both forms are matched. SiLU has no standard ONNX op +and decomposes to ``Sigmoid(x) * x``. Only FLOAT32 inputs are converted. + +Folding +------- + +PWPolyF uses PE parallelism. ``NumChannels % PE == 0`` must hold. 
Each PE +instantiates its own polynomial evaluation pipeline with ``degree`` DSPs. +``SetFolding`` handles PE selection automatically. + +.. list-table:: + :header-rows: 1 + :widths: 10 10 15 15 15 25 + + * - PE + - Degree + - DSPs + - BRAM18s + - Approx LUTs + - Cycles per spatial position + * - 1 + - 2 + - 2 + - 1 + - 200 + - NumChannels + * - C + - 2 + - 2C + - C + - 200C + - 1 + * - 1 + - 3 + - 3 + - 2 + - 300 + - NumChannels + +Resource Estimates +------------------ + +* DSP: ``degree * PE`` (one FP32 FMA stage per polynomial degree per PE) +* LUT: approximately ``100 * degree * PE`` for segment address decode and + control +* BRAM18: ``(degree - 1) * PE`` for default ``K=3``. Vivado infers delayed + coefficient lookups as 32-bit ROMs. +* URAM: 0 + +ONNX Export +----------- + +Two export paths are supported: + +* ``PiecewisePolyActivation`` exports as a single ``PWPolyF`` custom op via + ``torch.autograd.Function.symbolic()``. It requires ``dynamo=False`` and + preserves the ``K`` attribute on the ONNX node. +* Standard PyTorch modules (``nn.GELU``, ``nn.SiLU``, ``nn.Sigmoid``, + ``nn.Tanh``) export with ``dynamo=True`` or ``dynamo=False`` and produce + standard ONNX ops that ``InferPWPolyFLayer`` converts to PWPolyF with + default ``K=3``. + +Attributes on the explicit PWPolyF ONNX node are: + +* ``func``: one of ``gelu``, ``silu``, ``sigmoid``, ``tanh`` +* ``K``: mantissa subdivision bits, default 3 + +Node Attributes +--------------- + +.. 
list-table:: + :header-rows: 1 + :widths: 25 15 45 + + * - Attribute + - Type + - Description + * - ``func`` + - string + - Activation function name + * - ``K`` + - int + - Mantissa subdivision bits, default 3 + * - ``degree`` + - int + - Polynomial degree / FMA stages, default 2 + * - ``NumChannels`` + - int + - Number of channels in the last input dimension + * - ``PE`` + - int + - Processing elements + * - ``inputDataType`` + - string + - Input data type, always FLOAT32 + * - ``outputDataType`` + - string + - Output data type, always FLOAT32 + * - ``numInputVectors`` + - ints + - Batch/spatial dimensions + +Supported Functions +------------------- + +.. list-table:: + :header-rows: 1 + :widths: 20 20 30 + + * - Function + - Negative clamp + - Positive behaviour + * - GELU + - 0.0 + - passthrough (``y=x``) + * - SiLU + - 0.0 + - passthrough (``y=x``) + * - Sigmoid + - 0.0 + - clamp to 1.0 + * - Tanh + - -1.0 + - clamp to 1.0 + +Files +----- + +Python files: + +.. list-table:: + :header-rows: 1 + :widths: 35 50 + + * - File + - Purpose + * - ``util/torch_hw_modules.py`` + - PyTorch activation module, ONNX export, software simulation + * - ``custom_op/fpgadataflow/pwpolyf.py`` + - Base HW op for shape, folding, resource estimates, cppsim + * - ``custom_op/fpgadataflow/rtl/pwpolyf_rtl.py`` + - RTL backend for HDL generation, package generation, rtlsim, IPI + * - ``util/pwpolyf.py`` + - Compatibility imports for existing PWPolyF utility users + * - ``transformation/fpgadataflow/convert_to_hw_layers.py`` + - ``InferPWPolyFLayer`` transformation + * - ``builder/build_dataflow_steps.py`` + - Build pipeline integration + * - ``transformation/fpgadataflow/set_folding.py`` + - Folding support + +RTL files: + +.. 
list-table:: + :header-rows: 1 + :widths: 35 50 + + * - File + - Purpose + * - ``finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv`` + - ``func_cfg_t`` struct per activation, regenerated per K + * - ``finn-rtllib/pwpolyf/hdl/pwpolyf.sv`` + - Polynomial evaluation pipeline using a Horner chain on DSPFP32 + * - ``finn-rtllib/pwpolyf/hdl/queue.sv`` + - Elastic FIFO for backpressure + * - ``finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v`` + - AXI-Stream wrapper template + +Tests +----- + +``tests/fpgadataflow/test_fpgadataflow_pwpolyf.py`` covers: + +* cppsim for all supported functions, channel counts, spatial shapes, and + foldings +* ONNX export for the explicit ``PiecewisePolyActivation`` path +* ``InferPWPolyFLayer`` conversion and execution +* standard op inference for Gelu, Sigmoid, Tanh, SiLU, and Erf-based GELU +* execution correctness against ``PiecewisePolyActivation`` +* Versal-only specialization checks +* resource estimates, folded shapes, and expected cycles +* coefficient package generation for ``K`` and ``degree`` +* Vivado HDL generation, RTL simulation, and stitched IP simulation diff --git a/docs/finn/reference/folding-constraints.rst b/docs/finn/reference/folding-constraints.rst index 8300a78ce4..67fafaacd3 100644 --- a/docs/finn/reference/folding-constraints.rst +++ b/docs/finn/reference/folding-constraints.rst @@ -68,6 +68,9 @@ Constraint Table * - Pool - PE - inp_channels % PE == 0 + * - PWPolyF + - PE + - NumChannels % PE == 0 * - Thresholding - PE - MH % PE == 0 diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index 84e9633304..1dc2d71f1c 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -136,6 +136,15 @@ finn.custom\_op.fpgadataflow.pool :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.pwpolyf +-------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.pwpolyf + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.streamingdataflowpartition -------------------------------------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst index 346eddb073..e31176462f 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst @@ -45,6 +45,14 @@ finn.custom\_op.fpgadataflow.streamingfifo\_rtl :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.pwpolyf\_rtl +-------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.thresholding\_rtl ------------------------------------------------------- diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index 2115037df3..5ceebc2436 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -188,6 +188,24 @@ finn.util.pytorch :show-inheritance: +finn.util.torch_hw_modules +--------------------------- + +.. automodule:: finn.util.torch_hw_modules + :members: + :undoc-members: + :show-inheritance: + + +finn.util.pwpolyf +------------------- + +.. 
automodule:: finn.util.pwpolyf + :members: + :undoc-members: + :show-inheritance: + + finn.util.test --------------------- diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.abc b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc new file mode 100644 index 0000000000..c25b5fda3d --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc @@ -0,0 +1,5 @@ +import queue +read_sv pwpolyf_pkg.sv +read_sv pwpolyf.sv +setup_tb pwpolyf_tb +setup_top pwpolyf diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv new file mode 100644 index 0000000000..32c0b5ea6b --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv @@ -0,0 +1,326 @@ +/**************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief FP32 piecewise polynomial activation on DSPFP32. + * @author Shane Fleming + * + * @description + * Supports GELU, SiLU, Sigmoid, and Tanh via `parameter string FUNC`. + * + * Approximated by piecewise degree-D polynomials over segments defined + * by FP32 bit-extraction, where D = DEGREE from pwpolyf_pkg. + * Evaluated via Horner's method on a chain + * of D DSPFP32 instances, each computing FMA: out = C + A*B. + * + * Horner (degree D): y = a_0 + x*(a_1 + x*(... + x*a_D)) + * Stage 0: out = a_{D-1} + a_D * x + * Stage j: out = a_{D-1-j} + prev * x (j = 1 .. D-1) + * + * Clamping for |x| >= 8 (5 octaves): + * GELU/SiLU: neg -> 0, pos -> x (pass-through) + * Sigmoid: neg -> 0, pos -> 1.0 + * Tanh: neg -> -1, pos -> 1.0 + * + * Latency: D * DSP_LAT cycles (D DSP stages x 4 cycles each). II=1. 
+ ***************************************************************************/ + +//===----------------------------------------------------------------------===// +// Single DSPFP32 FMA wrapper: r = c + a * b +//===----------------------------------------------------------------------===// +module pwpolyf_dspfp32 ( + input logic clk, + input logic rst, + + input logic [31:0] a, + input logic [31:0] b, + input logic [31:0] c, + + output logic [31:0] r, + input logic rvld +); + + // FMA opmode: FPA_OUT = C + A*B + // FPOPMODE[6:5] = 00 (no sign flip on C or M) + // FPOPMODE[4:2] = 110 (select C for W mux, M for Z mux -- add path) + // FPOPMODE[1:0] = 01 (FP mode enable) + localparam logic [6:0] MODE_FMA = 7'b00_110_01; + + uwire invalid; + uwire overflow; + uwire underflow; + + DSPFP32 #( + .A_FPTYPE("B32"), + .A_INPUT("DIRECT"), + .BCASCSEL("B"), + .B_D_FPTYPE("B32"), + .B_INPUT("DIRECT"), + .PCOUTSEL("FPA"), + .USE_MULT("MULTIPLY"), + .IS_CLK_INVERTED(1'b0), + .IS_FPINMODE_INVERTED(1'b0), + .IS_FPOPMODE_INVERTED(7'b0000000), + .IS_RSTA_INVERTED(1'b0), + .IS_RSTB_INVERTED(1'b0), + .IS_RSTC_INVERTED(1'b0), + .IS_RSTD_INVERTED(1'b0), + .IS_RSTFPA_INVERTED(1'b0), + .IS_RSTFPINMODE_INVERTED(1'b0), + .IS_RSTFPMPIPE_INVERTED(1'b0), + .IS_RSTFPM_INVERTED(1'b0), + .IS_RSTFPOPMODE_INVERTED(1'b0), + .ACASCREG(1), + .AREG(1), + .FPA_PREG(1), + .FPBREG(1), + .FPCREG(3), // C needs 3 pipeline stages to align with M output + .FPDREG(0), + .FPMPIPEREG(1), + .FPM_PREG(1), + .FPOPMREG(0), + .INMODEREG(0), + .RESET_MODE("SYNC") + ) DSPFP32_inst ( + .ACOUT_EXP(), .ACOUT_MAN(), .ACOUT_SIGN(), + .BCOUT_EXP(), .BCOUT_MAN(), .BCOUT_SIGN(), + .PCOUT(), + .FPM_INVALID(), .FPM_OVERFLOW(), .FPM_UNDERFLOW(), .FPM_OUT(), + .FPA_INVALID(invalid), .FPA_OVERFLOW(overflow), .FPA_UNDERFLOW(underflow), .FPA_OUT(r), + .ACIN_EXP('x), .ACIN_MAN('x), .ACIN_SIGN('x), + .BCIN_EXP('x), .BCIN_MAN('x), .BCIN_SIGN('x), + .PCIN('x), + .CLK(clk), + .FPINMODE('1), // Select B path (not D) + .FPOPMODE(MODE_FMA), + 
.A_SIGN(a[31]), .A_EXP(a[30:23]), .A_MAN(a[22:0]), + .B_SIGN(b[31]), .B_EXP(b[30:23]), .B_MAN(b[22:0]), + .C(c), + .D_SIGN('x), .D_EXP('x), .D_MAN('x), + .ASYNC_RST('0), + .CEA1('0), .CEA2('1), + .CEB('1), .CEC('1), .CED('0), + .CEFPA('1), .CEFPINMODE('0), .CEFPM('1), .CEFPMPIPE('1), .CEFPOPMODE('0), + .RSTA('0), .RSTB('0), .RSTC('0), .RSTD('0), + .RSTFPA('0), .RSTFPINMODE('0), .RSTFPM('0), .RSTFPMPIPE('0), .RSTFPOPMODE('0) + ); + + // Simulation-time warnings + always_ff @(posedge clk) begin + if(!rst && rvld) begin + assert(!invalid) else $warning("%m generated invalid output."); + assert(!overflow) else $warning("%m generated an overflow."); + assert(!underflow) else $warning("%m generated an underflow."); + end + end + +endmodule : pwpolyf_dspfp32 + +//===----------------------------------------------------------------------===// +// Full PE-wide streaming activation with piecewise polynomial approximation. +// Degree D derived from DEGREE in pwpolyf_pkg. +//===----------------------------------------------------------------------===// +module pwpolyf #( + int unsigned PE = 1, + string FUNC = "gelu" +)( + // Global Control + input logic clk, + input logic rst, + + // Input Stream - PE elements wide + input logic [PE-1:0][31:0] xdat, + input logic xvld, + output logic xrdy, + + // Output Stream - PE elements wide + output logic [PE-1:0][31:0] ydat, + output logic yvld, + input logic yrdy +); + + import pwpolyf_pkg::*; + + localparam int unsigned NUM_SUBS = 1 << K; + localparam int unsigned DSP_LAT = 4; + localparam int unsigned LATENCY = DEGREE * DSP_LAT; + + initial begin + assert(DEGREE >= 1) else begin + $error("%m: DEGREE must be >= 1."); + $finish; + end + assert(FUNC == "gelu" || FUNC == "silu" || FUNC == "sigmoid" || FUNC == "tanh") else begin + $error("%m: Unsupported FUNC=\"%s\". 
Must be gelu|silu|sigmoid|tanh.", FUNC); + $finish; + end + end + + //=== Per-activation configuration ======================================= + localparam func_cfg_t CFG = + FUNC == "gelu" ? GELU : + FUNC == "silu" ? SILU : + FUNC == "sigmoid" ? SIGMOID : + TANH; + + //=== Clamping exponent threshold ========================================= + localparam int unsigned EXP_CLAMP = 130; // |x| >= 8.0 + + //=== Input Sidestep Register ============================================= + typedef logic [PE-1:0][31:0] fp_vec_t; + + uwire take; + + typedef struct { + fp_vec_t val; + logic rdy; + } ibuf_t; + ibuf_t Ibuf = '{ val: 'x, rdy: '1 }; + always_ff @(posedge clk) begin + if(rst) + Ibuf <= '{ val: 'x, rdy: '1 }; + else begin + if(Ibuf.rdy) Ibuf.val <= xdat; + Ibuf.rdy <= (Ibuf.rdy && !xvld) || take; + end + end + assign xrdy = Ibuf.rdy; + uwire fp_vec_t x_cur = Ibuf.rdy? xdat : Ibuf.val; + + //=== Credit-based Operation Issue ======================================== + localparam int unsigned CREDIT = LATENCY + 3; // pipeline + sidestep + queue read + logic signed [$clog2(CREDIT):0] Credit = -CREDIT; + uwire give = yvld && yrdy; + assign take = (xvld || !xrdy) && Credit[$left(Credit)]; + always_ff @(posedge clk) begin + if(rst) Credit <= -CREDIT; + else Credit <= Credit + (give == take? 0 : give? 
-1 : 1); + end + + //=== Per-PE Compute Pipeline ============================================= + uwire fp_vec_t r; + uwire [PE-1:0] rvld_vec; + uwire rvld; + + for(genvar pe = 0; pe < PE; pe++) begin : gen_pe + uwire [31:0] xi = x_cur[pe]; + + //--- Segment selector (combinational) -------------------------------- + uwire sign = xi[31]; + uwire [7:0] exp_bits = xi[30:23]; + uwire [K-1:0] sub = xi[22:23-K]; + + // Octave index: exp 125->0, 126->1, 127->2, 128->3, 129->4 + uwire [2:0] octave = exp_bits - 8'd125; + + // Classify + uwire is_near_zero = (exp_bits < 8'd125); + uwire is_pos_clamp = !sign && (exp_bits >= EXP_CLAMP); + uwire is_neg_clamp = sign && (exp_bits >= EXP_CLAMP); + + // Segment index for ROM lookup; width follows NUM_SEGS so K > 3 is not truncated + uwire [$clog2(NUM_SEGS)-1:0] seg_idx; + if(1) begin : blk_seg_idx + uwire [$clog2(NUM_SEGS)-1:0] pos_idx = 1 + {octave, sub}; + uwire [$clog2(NUM_SEGS)-1:0] neg_idx = 1 + NUM_SUBS * NUM_OCTAVES + {octave, sub}; + assign seg_idx = is_near_zero? '0 : + sign? neg_idx : pos_idx; + end : blk_seg_idx + + //--- Horner chain: DEGREE stages of pwpolyf_dspfp32 ------------------ + // Stage 0: s[0] = coeff[DEGREE-1] + coeff[DEGREE] * x + // Stage j: s[j] = coeff[DEGREE-1-j] + s[j-1] * x_delayed + // Total: DEGREE * DSP_LAT cycles + + // Valid pipeline + logic [LATENCY-1:0] Vld = '0; + always_ff @(posedge clk) begin + if(rst) Vld <= '0; + else Vld <= { Vld[$left(Vld)-1:0], take }; + end + assign rvld_vec[pe] = Vld[$left(Vld)]; + + // Delay x for DSP B inputs and pass-through clamp + logic [31:0] XDly[LATENCY] = '{default: 'x}; + always_ff @(posedge clk) begin + XDly[0] <= xi; + for(int i = 1; i < LATENCY; i++) + XDly[i] <= XDly[i-1]; + end + + // DSP chain + uwire [31:0] s[DEGREE]; + + for(genvar j = 0; j < DEGREE; j++) begin : genDSP + uwire [31:0] dsp_a = (j == 0)? CFG.coeffs[seg_idx][DEGREE] : s[j-1]; + uwire [31:0] dsp_b = (j == 0)? 
xi : XDly[j*DSP_LAT - 1]; + + // C input: coeff[DEGREE-1-j] delayed by j*DSP_LAT cycles + logic [31:0] dsp_c; + if(j == 0) begin : genCdir + assign dsp_c = CFG.coeffs[seg_idx][DEGREE-1]; + end : genCdir + else begin : genCdly + logic [31:0] CDly[j*DSP_LAT] = '{default: 'x}; + always_ff @(posedge clk) begin + CDly[0] <= CFG.coeffs[seg_idx][DEGREE-1-j]; + for(int i = 1; i < j*DSP_LAT; i++) + CDly[i] <= CDly[i-1]; + end + assign dsp_c = CDly[j*DSP_LAT - 1]; + end : genCdly + + pwpolyf_dspfp32 dsp ( + .clk, .rst, + .a(dsp_a), .b(dsp_b), .c(dsp_c), + .r(s[j]), .rvld(Vld[(j+1)*DSP_LAT - 1]) + ); + end : genDSP + + //--- Clamp mux ------------------------------------------------------- + logic [LATENCY-1:0] NegClamp = '0; + logic [LATENCY-1:0] PosClamp = '0; + always_ff @(posedge clk) begin + if(rst) begin + NegClamp <= '0; + PosClamp <= '0; + end + else begin + NegClamp <= { NegClamp[$left(NegClamp)-1:0], is_neg_clamp }; + PosClamp <= { PosClamp[$left(PosClamp)-1:0], is_pos_clamp }; + end + end + + // Output mux + assign r[pe] = NegClamp[$left(NegClamp)]? CFG.neg_clamp : + PosClamp[$left(PosClamp)]? (CFG.pos_passthrough? 
XDly[LATENCY-1] : CFG.pos_clamp) : + s[DEGREE-1]; + + end : gen_pe + + // All PE results should be valid simultaneously + assign rvld = rvld_vec[0]; + always_ff @(posedge clk) begin + assert(rvld_vec == {(PE){rvld}}) else begin + $error("%m: Inconsistent output valid indications."); + $stop; + end + end + + //=== Credit-backing Elastic Output Queue ================================= + uwire rrdy; + queue #(.DATA_WIDTH($bits(fp_vec_t)), .ELASTICITY(CREDIT)) obuf ( + .clk, .rst, + .idat(r), .ivld(rvld), .irdy(rrdy), + .odat(ydat), .ovld(yvld), .ordy(yrdy) + ); + always_ff @(posedge clk) begin + assert(rrdy || !rvld) else begin + $error("%m: Result queue overrun."); + $stop; + end + end + +endmodule : pwpolyf diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv new file mode 100644 index 0000000000..cdf479355e --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv @@ -0,0 +1,393 @@ +/**************************************************************************** + * Copyright Advanced Micro Devices, Inc. + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief Per-activation configuration for pwpolyf. + * @author Shane Fleming + * + * @description + * Package consolidating shared approximation constants and per-activation + * configuration (clamping parameters + coefficient arrays) for the + * piecewise polynomial activation unit. + * + * Coefficient data auto-generated by pwpolyf_rtl.py -- DEGREE=2 K=3 + * NUM_OCTAVES=5. Segments: 81 Coefficients per segment: 3 + * Polynomial: y = a_0 + a_1*x + a_2*x^2 + ... + a_d*x^d + * Horner form: y = a_0 + x*(a_1 + x*(a_2 + ... x*a_d)) + * + * Segment index encoding: + * 0 = near-zero (|x| < 0.25) + * 1 .. 5*2^K = positive octaves (exp 125..129) + * 5*2^K+1 .. 
end = negative octaves (exp 125..129) + ***************************************************************************/ +package pwpolyf_pkg; + + localparam int unsigned DEGREE = 2; + localparam int unsigned K = 3; + localparam int unsigned NUM_OCTAVES = 5; + localparam int unsigned NUM_SEGS = 81; + + typedef struct { + int unsigned neg_clamp; + int unsigned pos_clamp; + bit pos_passthrough; + int unsigned coeffs[NUM_SEGS][DEGREE+1]; + } func_cfg_t; + + localparam func_cfg_t GELU = '{ + neg_clamp: 32'h00000000, + pos_clamp: 32'h00000000, + pos_passthrough: 1, + coeffs: '{ + '{ 32'h37B92D98, 32'h3F000000, 32'h3ECA7276 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'hBA7AD2E9, 32'h3F0278B4, 32'h3EBE374D }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'hBAC20AB3, 32'h3F036C6B, 32'h3EBAD579 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'hBB0F6AA9, 32'h3F04950F, 32'h3EB720D6 }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'hBB4C32E1, 32'h3F05F66C, 32'h3EB31D68 }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'hBB8CE121, 32'h3F0793CF, 32'h3EAECF81 }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'hBBBD44F5, 32'h3F097002, 32'h3EAA3BB6 }, // [6] pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'hBBF87F2B, 32'h3F0B8D46, 32'h3EA566DF }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'hBC1FE374, 32'h3F0DED4E, 32'h3EA05607 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'hBC6196C5, 32'h3F11FAE7, 32'h3E985541 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'hBCA9FA9B, 32'h3F185408, 32'h3E8D0D86 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'hBCF402A2, 32'h3F1FBA0A, 32'h3E813805 }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'hBD28346B, 32'h3F281F7A, 32'h3E6A0426 }, // [12] pos_oct1_sub3: [0.6875, 0.7500) + '{ 32'hBD6013B5, 32'h3F316EAB, 32'h3E513158 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'hBD90E5E4, 32'h3F3B8A8B, 32'h3E384F09 }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'hBDB698C5, 32'h3F464FB9, 32'h3E1FB052 }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 
32'hBDE0DF89, 32'h3F5195CD, 32'h3E07A270 }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'hBE13A45A, 32'h3F62F993, 32'h3DCA8921 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'hBE483CB7, 32'h3F7A5DFA, 32'h3D6E8A6E }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'hBE7FAE79, 32'h3F8848C8, 32'h3CC080D0 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'hBE9AF2A1, 32'h3F9227C4, 32'hBB9670CC }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'hBEB35AF6, 32'h3F9A4E45, 32'hBCD3D62F }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'hBEC733B2, 32'h3FA06D28, 32'hBD26553D }, // [22] pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'hBED511F1, 32'h3FA466BA, 32'hBD4ACBAB }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'hBEDC2153, 32'h3FA64B63, 32'hBD5B0B08 }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'hBED97943, 32'h3FA5A9DF, 32'hBD5641AE }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'hBEC1A84E, 32'h3FA066D7, 32'hBD310ADA }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'hBE9AF232, 32'h3F98AA14, 32'hBCFF13CB }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'hBE609A95, 32'h3F90E6E9, 32'hBCA49F87 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 32'hBE145DCE, 32'h3F8A8925, 32'hBC411EC8 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'hBDB37CB4, 32'h3F8603AD, 32'hBBCFAA27 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'hBD47900C, 32'h3F8328BC, 32'hBB4DD2CD }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'hBCCC8719, 32'h3F818839, 32'hBABCBE73 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'hBC075296, 32'h3F8077C5, 32'hB9D4797E }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'hBAB5A3CC, 32'h3F801236, 32'hB869FE3F }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'hB935F6EF, 32'h3F800215, 32'hB6C346B7 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'hB78928DE, 32'h3F80002E, 32'hB4F89CA8 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'hB59C87B9, 32'h3F800003, 32'hB2F2851B }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'hB387DBFC, 32'h3F800000, 32'hB0B5DB02 
}, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'hB133EE82, 32'h3F800000, 32'hAE520F13 }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'hAEB983D5, 32'h3F800000, 32'hABBEA652 }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'hBA7AD2E9, 32'h3EFB0E98, 32'h3EBE374D }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'hBAC20AB3, 32'h3EF9272A, 32'h3EBAD579 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'hBB0F6AA9, 32'h3EF6D5E1, 32'h3EB720D6 }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'hBB4C32E1, 32'h3EF41328, 32'h3EB31D68 }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + '{ 32'hBB8CE121, 32'h3EF0D863, 32'h3EAECF81 }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'hBBBD44F5, 32'h3EED1FFC, 32'h3EAA3BB6 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'hBBF87F2B, 32'h3EE8E573, 32'h3EA566DF }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'hBC1FE374, 32'h3EE42563, 32'h3EA05607 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'hBC6196C5, 32'h3EDC0A33, 32'h3E985541 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'hBCA9FA9B, 32'h3ECF57F0, 32'h3E8D0D86 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'hBCF402A2, 32'h3EC08BED, 32'h3E813805 }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'hBD28346B, 32'h3EAFC10C, 32'h3E6A0426 }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'hBD6013B5, 32'h3E9D22AA, 32'h3E513158 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'hBD90E5E4, 32'h3E88EAEA, 32'h3E384F09 }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'hBDB698C5, 32'h3E66C11A, 32'h3E1FB052 }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'hBDE0DF89, 32'h3E39A8CD, 32'h3E07A270 }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'hBE13A45A, 32'h3DE83369, 32'h3DCA8921 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'hBE483CB7, 32'h3CB440CF, 32'h3D6E8A6E }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'hBE7FAE79, 32'hBD848C82, 32'h3CC080D0 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'hBE9AF2A1, 32'hBE113E1E, 32'hBB9670CC }, 
// [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'hBEB35AF6, 32'hBE52722A, 32'hBCD3D62F }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'hBEC733B2, 32'hBE81B4A1, 32'hBD26553D }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'hBED511F1, 32'hBE919AE6, 32'hBD4ACBAB }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'hBEDC2153, 32'hBE992D8A, 32'hBD5B0B08 }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'hBED97943, 32'hBE96A77B, 32'hBD5641AE }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'hBEC1A84E, 32'hBE819B5D, 32'hBD310ADA }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) + '{ 32'hBE9AF232, 32'hBE4550A3, 32'hBCFF13CB }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'hBE609A95, 32'hBE073747, 32'hBCA49F87 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'hBE145DCE, 32'hBDA8924A, 32'hBC411EC8 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'hBDB37CB4, 32'hBD4075A8, 32'hBBCFAA27 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'hBD47900C, 32'hBCCA2F0E, 32'hBB4DD2CD }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'hBCCC8719, 32'hBC441CB8, 32'hBABCBE73 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 32'hBC075296, 32'hBB6F8A8D, 32'hB9D4797E }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'hBAB5A3CC, 32'hBA11AE73, 32'hB869FE3F }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'hB935F6EF, 32'hB8853C9B, 32'hB6C346B7 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'hB78928DF, 32'hB6B89CD1, 32'hB4F89CA9 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'hB59C87B7, 32'hB4C2CE38, 32'hB2F28516 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'hB387DC68, 32'hB29D2B70, 32'hB0B5DB95 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'hB1340659, 32'hB0427F9C, 32'hAE52296C }, // [79] neg_oct4_sub6: [-7.5000, -7.0000) + '{ 32'hAEB6A669, 32'hADB910C4, 32'hABBB86A1 } // [80] neg_oct4_sub7: [-8.0000, -7.5000) + } + }; + + localparam func_cfg_t SILU = '{ + neg_clamp: 32'h00000000, + pos_clamp: 32'h00000000, + pos_passthrough: 1, + coeffs: '{ + '{ 
32'h36E8E4D8, 32'h3F000000, 32'h3E7EDCA9 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'hB99F0E43, 32'h3F00C85E, 32'h3E771F38 }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'hB9F6D43F, 32'h3F01164A, 32'h3E74F597 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'hBA370988, 32'h3F017596, 32'h3E729418 }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'hBA82C874, 32'h3F01E7B8, 32'h3E6FFC71 }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'hBAB52EA2, 32'h3F026E06, 32'h3E6D3079 }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'hBAF47A83, 32'h3F0309BD, 32'h3E6A3227 }, // [6] pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'hBB213FE4, 32'h3F03BBFC, 32'h3E670392 }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'hBB508F23, 32'h3F0485C2, 32'h3E63A6E9 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'hBB945DC5, 32'h3F05E1DA, 32'h3E5E4861 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'hBBE25EA1, 32'h3F080BF4, 32'h3E569783 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'hBC24C1FC, 32'h3F0A9F8F, 32'h3E4E59C7 }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'hBC66B247, 32'h3F0D9E5F, 32'h3E45A3CF }, // [12] pos_oct1_sub3: [0.6875, 0.7500) + '{ 32'hBC9C5235, 32'h3F11080C, 32'h3E3C8A91 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'hBCCE03BB, 32'h3F14DA46, 32'h3E332307 }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'hBD048260, 32'h3F1910E8, 32'h3E2981D7 }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 32'hBD26E393, 32'h3F1DA629, 32'h3E1FBB0A }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'hBD637E84, 32'h3F252120, 32'h3E10F470 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'hBDA33C0B, 32'h3F301FB8, 32'h3DFACFC9 }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'hBDDE6AD0, 32'h3F3BF5DC, 32'h3DD4EC26 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'hBE112138, 32'h3F484C26, 32'h3DB1044F }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'hBE369C04, 32'h3F54CB60, 32'h3D8FAC7F }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'hBE5EB18B, 32'h3F612208, 32'h3D629168 }, // [22] 
pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'hBE842940, 32'h3F6D0854, 32'h3D2C2276 }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'hBE9933D8, 32'h3F7842BB, 32'h3CF863D2 }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'hBEB7B0C1, 32'h3F83AB39, 32'h3C8110DA }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'hBEDD223E, 32'h3F8C0325, 32'h3AA08170 }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'hBEFBE17D, 32'h3F922E48, 32'hBC0A6454 }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'hBF093230, 32'h3F9649E6, 32'hBC6A5E21 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 32'hBF101F60, 32'h3F989BC7, 32'hBC8E0D50 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'hBF12EF0A, 32'h3F997B2D, 32'hBC96B8B5 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'hBF121E34, 32'h3F9940CD, 32'hBC94AEB1 }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'hBF0E4E33, 32'h3F983D51, 32'hBC8C0ED6 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'hBF04791A, 32'h3F95D0ED, 32'hBC71E16F }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'hBEE59935, 32'h3F91E3AD, 32'hBC3A058C }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'hBEBEB440, 32'h3F8DFEE7, 32'hBC081CC1 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'hBE9948D3, 32'h3F8A96AC, 32'hBBC0C8D1 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'hBE70101E, 32'h3F87D004, 32'hBB8571D8 }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'hBE3805CC, 32'h3F85A739, 32'hBB35ABD2 }, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'hBE0A951B, 32'h3F840701, 32'hBAF42F39 }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'hBDCD9ACB, 32'h3F82D521, 32'hBAA27541 }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'hB99F0E43, 32'h3EFE6F44, 32'h3E771F38 }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'hB9F6D43F, 32'h3EFDD36C, 32'h3E74F597 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'hBA370988, 32'h3EFD14D3, 32'h3E729418 }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'hBA82C874, 32'h3EFC3091, 32'h3E6FFC71 }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + 
'{ 32'hBAB52EA2, 32'h3EFB23F5, 32'h3E6D3079 }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'hBAF47A83, 32'h3EF9EC85, 32'h3E6A3227 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'hBB213FE4, 32'h3EF88807, 32'h3E670392 }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'hBB508F23, 32'h3EF6F47D, 32'h3E63A6E9 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'hBB945DC5, 32'h3EF43C4C, 32'h3E5E4861 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'hBBE25EA1, 32'h3EEFE817, 32'h3E569783 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'hBC24C1FC, 32'h3EEAC0E2, 32'h3E4E59C7 }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'hBC66B247, 32'h3EE4C342, 32'h3E45A3CF }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'hBC9C5235, 32'h3EDDEFE8, 32'h3E3C8A91 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'hBCCE03BB, 32'h3ED64B75, 32'h3E332307 }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'hBD048260, 32'h3ECDDE30, 32'h3E2981D7 }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'hBD26E393, 32'h3EC4B3AE, 32'h3E1FBB0A }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'hBD637E84, 32'h3EB5BDC0, 32'h3E10F470 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'hBDA33C0B, 32'h3E9FC090, 32'h3DFACFC9 }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'hBDDE6AD0, 32'h3E881448, 32'h3DD4EC26 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'hBE112138, 32'h3E5ECF69, 32'h3DB1044F }, // [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'hBE369C04, 32'h3E2CD280, 32'h3D8FAC7F }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'hBE5EB18B, 32'h3DF6EFBF, 32'h3D629168 }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'hBE842940, 32'h3D97BD5C, 32'h3D2C2276 }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'hBE9933D8, 32'h3CF7A895, 32'h3CF863D2 }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'hBEB7B0C1, 32'hBCEACE5E, 32'h3C8110DA }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'hBEDD223E, 32'hBDC03253, 32'h3AA08170 }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) 
+ '{ 32'hBEFBE17D, 32'hBE11723E, 32'hBC0A6454 }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'hBF093230, 32'hBE324F32, 32'hBC6A5E21 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'hBF101F60, 32'hBE44DE3B, 32'hBC8E0D50 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'hBF12EF0A, 32'hBE4BD96A, 32'hBC96B8B5 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'hBF121E34, 32'hBE4A0667, 32'hBC94AEB1 }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'hBF0E4E33, 32'hBE41EA8C, 32'hBC8C0ED6 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 32'hBF04791A, 32'hBE2E8765, 32'hBC71E16F }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'hBEE59935, 32'hBE0F1D69, 32'hBC3A058C }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'hBEBEB440, 32'hBDDFEE68, 32'hBC081CC1 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'hBE9948D3, 32'hBDA96ABE, 32'hBBC0C8D1 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'hBE70101E, 32'hBD7A008A, 32'hBB8571D8 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'hBE3805CC, 32'hBD34E717, 32'hBB35ABD2 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'hBE0A951B, 32'hBD00E023, 32'hBAF42F39 }, // [79] neg_oct4_sub6: [-7.5000, -7.0000) + '{ 32'hBDCD9ACB, 32'hBCB54833, 32'hBAA27541 } // [80] neg_oct4_sub7: [-8.0000, -7.5000) + } + }; + + localparam func_cfg_t SIGMOID = '{ + neg_clamp: 32'h00000000, + pos_clamp: 32'h3F800000, + pos_passthrough: 0, + coeffs: '{ + '{ 32'h3F000000, 32'h3E7F33E9, 32'h00000000 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'h3EFFCF0E, 32'h3E822D89, 32'hBC84D823 }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'h3EFFBC5D, 32'h3E82B26C, 32'hBC939CA5 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'h3EFFA5B9, 32'h3E834349, 32'hBCA219C6 }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'h3EFF8AE8, 32'h3E83DF4C, 32'hBCB04918 }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'h3EFF6BBB, 32'h3E84858E, 32'hBCBE247C }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'h3EFF4812, 32'h3E85351C, 32'hBCCBA623 }, // [6] 
pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'h3EFF1FDA, 32'h3E85ECF7, 32'hBCD8C899 }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'h3EFEF30F, 32'h3E86AC14, 32'hBCE586C5 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'h3EFEA7FD, 32'h3E87D4B2, 32'hBCF7D7C9 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'h3EFE340F, 32'h3E897100, 32'hBD0761C6 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'h3EFDB07F, 32'h3E8B1626, 32'hBD11EAFB }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'h3EFD1FEF, 32'h3E8CBAE0, 32'hBD1B7C9A }, // [12] pos_oct1_sub3: [0.6875, 0.7500) + '{ 32'h3EFC85D6, 32'h3E8E5602, 32'hBD240EF3 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'h3EFBE66B, 32'h3E8FDEA9, 32'hBD2B9D89 }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'h3EFB4684, 32'h3E914C67, 32'hBD3226EB }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 32'h3EFAAB7A, 32'h3E929769, 32'hBD37AC88 }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'h3EF9DB97, 32'h3E9432F4, 32'hBD3E0A59 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'h3EF90A04, 32'h3E95A99D, 32'hBD434674 }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'h3EF8B5B0, 32'h3E963283, 32'hBD4502F9 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'h3EF90A01, 32'h3E95B9A5, 32'hBD43A873 }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'h3EFA2B30, 32'h3E94399D, 32'hBD3FAC42 }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'h3EFC3409, 32'h3E91B9DE, 32'hBD39885F }, // [22] pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'h3EFF34CB, 32'h3E8E4C54, 32'hBD31B499 }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'h3F019987, 32'h3E8A0AB5, 32'hBD28A16E }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'h3F057A9E, 32'h3E8262A4, 32'hBD1983F9 }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'h3F0C3068, 32'h3E6CECEE, 32'hBD0452A4 }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'h3F1447FB, 32'h3E530738, 32'hBCDF3123 }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'h3F1D4A57, 32'h3E38CEB4, 32'hBCB90648 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 
32'h3F26C116, 32'h3E1F8DBB, 32'hBC975293 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'h3F30448D, 32'h3E081E0B, 32'hBC74E633 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'h3F398293, 32'h3DE5F2D2, 32'hBC4485E9 }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'h3F423FDE, 32'h3DC0A0B4, 32'hBC1CAC71 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'h3F4DF46F, 32'h3D924A89, 32'hBBDD9C65 }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'h3F5B2AC0, 32'h3D464E5F, 32'hBB897CEB }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'h3F656FD2, 32'h3D045D55, 32'hBB291712 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'h3F6D25B4, 32'h3CAEBB3F, 32'hBACED637 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'h3F72CA7A, 32'h3C64B7EA, 32'hBA7C2E52 }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'h3F76D7C3, 32'h3C14B63C, 32'hBA196D41 }, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'h3F79B550, 32'h3BC05DAC, 32'hB9BA76BE }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'h3F7BB5C3, 32'h3B77C0C9, 32'hB96272C7 }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'h3F001879, 32'h3E822D89, 32'h3C84D823 }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'h3F0021D2, 32'h3E82B26C, 32'h3C939CA5 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'h3F002D23, 32'h3E834349, 32'h3CA219C6 }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'h3F003A8C, 32'h3E83DF4C, 32'h3CB04918 }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + '{ 32'h3F004A22, 32'h3E84858E, 32'h3CBE247C }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'h3F005BF7, 32'h3E85351C, 32'h3CCBA623 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'h3F007013, 32'h3E85ECF7, 32'h3CD8C899 }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'h3F008678, 32'h3E86AC14, 32'h3CE586C5 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'h3F00AC01, 32'h3E87D4B2, 32'h3CF7D7C9 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'h3F00E5F9, 32'h3E897100, 32'h3D0761C6 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'h3F0127C1, 
32'h3E8B1626, 32'h3D11EAFB }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'h3F017009, 32'h3E8CBAE0, 32'h3D1B7C9A }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'h3F01BD15, 32'h3E8E5602, 32'h3D240EF3 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'h3F020CCB, 32'h3E8FDEA9, 32'h3D2B9D89 }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'h3F025CBE, 32'h3E914C67, 32'h3D3226EB }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'h3F02AA43, 32'h3E929769, 32'h3D37AC88 }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'h3F031234, 32'h3E9432F4, 32'h3D3E0A59 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'h3F037AFE, 32'h3E95A99D, 32'h3D434674 }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'h3F03A528, 32'h3E963283, 32'h3D4502F9 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'h3F037AFF, 32'h3E95B9A5, 32'h3D43A873 }, // [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'h3F02EA68, 32'h3E94399D, 32'h3D3FAC42 }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'h3F01E5FC, 32'h3E91B9DE, 32'h3D39885F }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'h3F00659B, 32'h3E8E4C54, 32'h3D31B499 }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'h3EFCCCF2, 32'h3E8A0AB5, 32'h3D28A16E }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'h3EF50AC5, 32'h3E8262A4, 32'h3D1983F9 }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'h3EE79F30, 32'h3E6CECEE, 32'h3D0452A4 }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) + '{ 32'h3ED7700A, 32'h3E530738, 32'h3CDF3123 }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'h3EC56B51, 32'h3E38CEB4, 32'h3CB90648 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'h3EB27DD3, 32'h3E1F8DBB, 32'h3C975293 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'h3E9F76E6, 32'h3E081E0B, 32'h3C74E633 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'h3E8CFAD9, 32'h3DE5F2D2, 32'h3C4485E9 }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'h3E770088, 32'h3DC0A0B4, 32'h3C1CAC71 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 
32'h3E482E45, 32'h3D924A89, 32'h3BDD9C65 }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'h3E1354FF, 32'h3D464E5F, 32'h3B897CEB }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'h3DD48171, 32'h3D045D55, 32'h3B291712 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'h3D96D261, 32'h3CAEBB3F, 32'h3ACED637 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'h3D535864, 32'h3C64B7EA, 32'h3A7C2E52 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'h3D1283CA, 32'h3C14B63C, 32'h3A196D41 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'h3CC95606, 32'h3BC05DAC, 32'h39BA76BE }, // [79] neg_oct4_sub6: [-7.5000, -7.0000) + '{ 32'h3C894795, 32'h3B77C0C9, 32'h396272C7 } // [80] neg_oct4_sub7: [-8.0000, -7.5000) + } + }; + + localparam func_cfg_t TANH = '{ + neg_clamp: 32'hBF800000, + pos_clamp: 32'h3F800000, + pos_passthrough: 0, + coeffs: '{ + '{ 32'hA1B504F3, 32'h3F7CDA60, 32'h00000000 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'hBBAC0178, 32'h3F87D4B2, 32'hBE77D7C9 }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'hBBE5F89C, 32'h3F897100, 32'hBE8761C6 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'hBC13E055, 32'h3F8B1626, 32'hBE91EAFB }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'hBC38044C, 32'h3F8CBAE0, 32'hBE9B7C9A }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'hBC5E8A68, 32'h3F8E5602, 32'hBEA40EF3 }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'hBC8332A5, 32'h3F8FDEA9, 32'hBEAB9D89 }, // [6] pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'hBC972F8C, 32'h3F914C67, 32'hBEB226EB }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'hBCAA90CC, 32'h3F929769, 32'hBEB7AC88 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'hBCC48D18, 32'h3F9432F4, 32'hBEBE0A59 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'hBCDEBF75, 32'h3F95A99D, 32'hBEC34674 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'hBCE94A05, 32'h3F963283, 32'hBEC502F9 }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'hBCDEBFD9, 32'h3F95B9A5, 32'hBEC3A873 }, // [12] pos_oct1_sub3: [0.6875, 0.7500) 
+ '{ 32'hBCBA99F8, 32'h3F94399D, 32'hBEBFAC42 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'hBC72FDCF, 32'h3F91B9DE, 32'hBEB9885F }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'hBB4B3567, 32'h3F8E4C54, 32'hBEB1B499 }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 32'h3C4CC375, 32'h3F8A0AB5, 32'hBEA8A16E }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'h3D2F53B5, 32'h3F8262A4, 32'hBE9983F9 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'h3DC3067D, 32'h3F6CECEE, 32'hBE8452A4 }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'h3E223FD6, 32'h3F530738, 32'hBE5F3123 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'h3E6A52BB, 32'h3F38CEB4, 32'hBE390648 }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'h3E9B0459, 32'h3F1F8DBB, 32'hBE175293 }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'h3EC11234, 32'h3F081E0B, 32'hBDF4E633 }, // [22] pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'h3EE60A4E, 32'h3EE5F2D2, 32'hBDC485E9 }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'h3F047FBC, 32'h3EC0A0B4, 32'hBD9CAC71 }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'h3F1BE8DE, 32'h3E924A89, 32'hBD5D9C65 }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'h3F365580, 32'h3E464E5F, 32'hBD097CEB }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'h3F4ADFA4, 32'h3E045D55, 32'hBCA91712 }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'h3F5A4B68, 32'h3DAEBB3F, 32'hBC4ED637 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 32'h3F6594F3, 32'h3D64B7EA, 32'hBBFC2E52 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'h3F6DAF87, 32'h3D14B63C, 32'hBB996D41 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'h3F736AA0, 32'h3CC05DAC, 32'hBB3A76BE }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'h3F776B87, 32'h3C77C0C9, 32'hBAE272C7 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'h3F7B291E, 32'h3C00F319, 32'hBA590184 }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'h3F7DD41A, 32'h3B51D16D, 32'hB99FC02C }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'h3F7F0B06, 32'h3AA9199D, 
32'hB8EB1F74 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'h3F7F95A4, 32'h3A073D0F, 32'hB82D01A6 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'h3F7FD268, 32'h3956EC40, 32'hB77E96C4 }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'h3F7FECAA, 32'h38A9D71D, 32'hB6BB5164 }, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'h3F7FF7E0, 32'h38059366, 32'hB609D243 }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'h3F7FFC9E, 32'h37513C1C, 32'hB54ACE8B }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'h3BAC0178, 32'h3F87D4B2, 32'h3E77D7C9 }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'h3BE5F89C, 32'h3F897100, 32'h3E8761C6 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'h3C13E055, 32'h3F8B1626, 32'h3E91EAFB }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'h3C38044C, 32'h3F8CBAE0, 32'h3E9B7C9A }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + '{ 32'h3C5E8A68, 32'h3F8E5602, 32'h3EA40EF3 }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'h3C8332A5, 32'h3F8FDEA9, 32'h3EAB9D89 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'h3C972F8C, 32'h3F914C67, 32'h3EB226EB }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'h3CAA90CC, 32'h3F929769, 32'h3EB7AC88 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'h3CC48D18, 32'h3F9432F4, 32'h3EBE0A59 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'h3CDEBF75, 32'h3F95A99D, 32'h3EC34674 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'h3CE94A05, 32'h3F963283, 32'h3EC502F9 }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'h3CDEBFD9, 32'h3F95B9A5, 32'h3EC3A873 }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'h3CBA99F8, 32'h3F94399D, 32'h3EBFAC42 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'h3C72FDCF, 32'h3F91B9DE, 32'h3EB9885F }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'h3B4B3567, 32'h3F8E4C54, 32'h3EB1B499 }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'hBC4CC375, 32'h3F8A0AB5, 32'h3EA8A16E }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'hBD2F53B5, 32'h3F8262A4, 
32'h3E9983F9 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'hBDC3067D, 32'h3F6CECEE, 32'h3E8452A4 }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'hBE223FD6, 32'h3F530738, 32'h3E5F3123 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'hBE6A52BB, 32'h3F38CEB4, 32'h3E390648 }, // [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'hBE9B0459, 32'h3F1F8DBB, 32'h3E175293 }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'hBEC11234, 32'h3F081E0B, 32'h3DF4E633 }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'hBEE60A4E, 32'h3EE5F2D2, 32'h3DC485E9 }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'hBF047FBC, 32'h3EC0A0B4, 32'h3D9CAC71 }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'hBF1BE8DE, 32'h3E924A89, 32'h3D5D9C65 }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'hBF365580, 32'h3E464E5F, 32'h3D097CEB }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) + '{ 32'hBF4ADFA4, 32'h3E045D55, 32'h3CA91712 }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'hBF5A4B68, 32'h3DAEBB3F, 32'h3C4ED637 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'hBF6594F3, 32'h3D64B7EA, 32'h3BFC2E52 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'hBF6DAF87, 32'h3D14B63C, 32'h3B996D41 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'hBF736AA0, 32'h3CC05DAC, 32'h3B3A76BE }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'hBF776B87, 32'h3C77C0C9, 32'h3AE272C7 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 32'hBF7B291E, 32'h3C00F319, 32'h3A590184 }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'hBF7DD41A, 32'h3B51D16D, 32'h399FC02C }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'hBF7F0B06, 32'h3AA9199D, 32'h38EB1F74 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'hBF7F95A4, 32'h3A073D0F, 32'h382D01A6 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'hBF7FD268, 32'h3956EC40, 32'h377E96C4 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'hBF7FECAA, 32'h38A9D71D, 32'h36BB5164 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'hBF7FF7E0, 
32'h38059366, 32'h3609D243 }, // [79] neg_oct4_sub6: [-7.5000, -7.0000)
+			'{ 32'hBF7FFC9E, 32'h37513C1C, 32'h354ACE8B }  // [80] neg_oct4_sub7: [-8.0000, -7.5000)
+		}
+	};
+
+endpackage
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv
new file mode 100644
index 0000000000..f98929e2ab
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv
@@ -0,0 +1,145 @@
+/****************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench for pwpolyf: FP32 piecewise polynomial activation.
+ * @author	Shane Fleming
+ *
+ * @description
+ *  Tests all four activation functions (gelu, silu, sigmoid, tanh) in
+ *  parallel using random FP32 stimulus with online shortreal-based
+ *  checking against a reference function (squared error below 1e-3).
+ ***************************************************************************/
+
+module pwpolyf_tb;
+
+	localparam int unsigned  TEST_COUNT = 4;
+	localparam string  FUNCS[TEST_COUNT] = '{"gelu", "silu", "sigmoid", "tanh"};
+	localparam int unsigned  RUNS = 4096;	// stimulus samples per function
+
+	// Global Control
+	logic  clk = 0;
+	always #5ns clk = !clk;
+	logic  rst = 1;
+	initial begin
+		repeat(12) @(posedge clk);	// hold reset for 12 rising edges
+		rst <= 0;
+	end
+
+	bit [TEST_COUNT-1:0]  done = '0;	// one completion flag per function under test
+	always_comb begin
+		if(&done)  $finish;	// end simulation once every test has reported
+	end
+
+	for(genvar  t = 0; t < TEST_COUNT; t++) begin : genTests
+		localparam string  FUNC = FUNCS[t];
+
+		// DUT wired for PE=1
+		logic [31:0]  xdat;
+		logic  xvld;
+		uwire  xrdy;
+		uwire [31:0]  ydat;
+		uwire  yvld;
+		logic  yrdy;
+
+		pwpolyf #(.PE(1), .FUNC(FUNC)) dut (
+			.clk, .rst,
+			.xdat, .xvld, .xrdy,
+			.ydat, .yvld, .yrdy
+		);
+		shortreal  y;
+		assign  y = $bitstoshortreal(ydat);
+
+		// Reference function -- compute in real, cast to shortreal; saturates for |x| >= 8
+		function automatic shortreal ref_func(input shortreal x);
+			automatic real  xr = real'(x);
+			automatic real  yr;
+			if(xr >= 8.0)
+				return (FUNC == "gelu" || FUNC == "silu")? x : shortreal'(1.0);
+			if(xr <= -8.0)
+				return (FUNC == "tanh")? shortreal'(-1.0) : shortreal'(0.0);
+			if(FUNC == "gelu") begin	// tanh-based GELU formula
+				automatic real  th = $tanh($sqrt(2.0/3.14159265358979) * (xr + 0.044715*xr*xr*xr));	// named th: must not shadow genvar t
+				yr = 0.5 * xr * (1.0 + th);
+			end
+			else if(FUNC == "silu")     yr = xr / (1.0 + $exp(-xr));
+			else if(FUNC == "sigmoid")  yr = 1.0 / (1.0 + $exp(-xr));
+			else                        yr = $tanh(xr);
+			return shortreal'(yr);
+		endfunction
+
+		// Online checking state: expected results in input order
+		shortreal  ExpQ[$];
+
+		// Stimulus driver
+		initial begin
+			xdat = '0;
+			xvld = 0;
+			@(posedge clk iff !rst);
+
+			repeat(RUNS) begin
+				automatic logic [31:0]  vbits;
+
+				// Cover |x| in [0.25, 16): all 5 octaves (exp 125..129) plus [8, 16) for the clamps
+				vbits = 32'h3E800000 + ($urandom() % 32'h03000000);	// 0.25 base + 6 octaves of 2^23 codes each
+				if($urandom() % 2) vbits[31] = 1;  // random sign
+				if($urandom() % 4 == 0) vbits = 32'h3F800000;  // 1.0
+				if($urandom() % 8 == 0) vbits = 32'h00000000;  // 0.0
+				if($urandom() % 8 == 0) vbits = 32'h40E00000 | ($urandom() % 32'h00100000);  // [7.0, 7.5)
+
+				while($urandom() % 17 == 0) @(posedge clk);	// random idle cycles between inputs
+
+				xdat <= vbits;
+				xvld <= 1;
+
+				@(posedge clk iff xrdy);	// input accepted
+				ExpQ.push_back(ref_func($bitstoshortreal(vbits)));
+
+				xvld <= 0;
+			end
+		end
+
+		always_ff @(posedge clk iff yvld && yrdy) begin
+			automatic shortreal  exp, err;
+			assert(ExpQ.size) else begin
+				$error("[%s] Spurious output.", FUNC);
+				$stop;
+			end
+			exp = ExpQ.pop_front();
+			err = y - exp;
+			err *= err;	// squared error
+			assert((err < 1e-3) || ($shortrealtobits(y) == $shortrealtobits(exp))) else begin
+				$error("[%s] Output mismatch: %f/%08x instead of %f/%08x",
+					FUNC, y, $shortrealtobits(y), exp, $shortrealtobits(exp));
+				$stop;
+			end
+		end
+
+		// Output collector -- drives yrdy backpressure
+		initial begin
+			yrdy = 0;
+			@(posedge clk iff !rst);
+
+			repeat(RUNS) begin
+				while($urandom() % 17 == 0) @(posedge clk);	// random stall before accepting
+				yrdy <= 1;
+				@(posedge clk iff yvld);
+				yrdy <= 0;
+			end
+
+			// Verify all expected outputs were consumed
+			@(posedge clk);
+			assert(ExpQ.size() == 0) else begin
+				$error("[%s] Missing %0d outputs.", FUNC, ExpQ.size());
+				$stop;
+			end
+
+			$display("PWPOLYF[%s]: %0d outputs verified online.", FUNC, RUNS);
+			done[t] = 1;
+		end
+
+	end : genTests
+
+endmodule : pwpolyf_tb
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
new file mode 100644
index 0000000000..9bbbaa0987
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
@@ -0,0 +1,69 @@
+/******************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION).
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog wrapper for pwpolyf IP packaging.
+ */
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter  PE = $PE$,
+	parameter  FUNC = $FUNC$
+)(
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out0_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
+
+	//- AXI Stream - Input --------------
+	output	in0_V_TREADY,
+	input	in0_V_TVALID,
+	input	[$IN_WIDTH$-1:0]  in0_V_TDATA,
+
+	//- AXI Stream - Output -------------
+	input	out0_V_TREADY,
+	output	out0_V_TVALID,
+	output	[$OUT_WIDTH$-1:0]  out0_V_TDATA
+);
+
+	pwpolyf #(
+		.PE(PE),
+		.FUNC(FUNC)
+	) core (
+		.clk(ap_clk),
+		.rst(!ap_rst_n),	// invert the active-low ap_rst_n for the core
+		.xdat(in0_V_TDATA),	// input stream -> x side of the core
+		.xvld(in0_V_TVALID),
+		.xrdy(in0_V_TREADY),
+		.ydat(out0_V_TDATA),	// y side of the core -> output stream
+		.yvld(out0_V_TVALID),
+		.yrdy(out0_V_TREADY)
+	);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/pwpolyf/hdl/queue.sv b/finn-rtllib/pwpolyf/hdl/queue.sv
new file mode 100755
index 0000000000..e5c3cf9889
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/queue.sv
@@ -0,0 +1,78 @@
+/****************************************************************************
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @author	Thomas B. Preußer
+ ***************************************************************************/
+
+module queue #(	// valid/ready queue: shift-in storage array A plus output register B
+	int unsigned  DATA_WIDTH,	// width of idat / odat in bits
+	int unsigned  ELASTICITY	// number of entries in A; must be >= 2 (checked below)
+)(
+	input	logic  clk,
+	input	logic  rst,	// synchronous: sampled inside the clocked block below
+
+	input	logic [DATA_WIDTH-1:0]  idat,
+	input	logic  ivld,
+	output	logic  irdy,
+
+	output	logic [DATA_WIDTH-1:0]  odat,
+	output	logic  ovld,
+	input	logic  ordy
+);
+
+	typedef logic [DATA_WIDTH-1:0]  dat_t;
+	initial begin
+		if(ELASTICITY < 2) begin	// parameter sanity check at simulation start
+			$error("%m: ELASTICITY of %0d must be made 2 or above.", ELASTICITY);
+			$finish;
+		end
+	end
+
+	logic signed [$clog2(ELASTICITY):0]  Ptr = '1;	// -1, 0, 1, ..., ELASTICITY-1
+	logic  Rdy = 1;	// space available in A; drives irdy
+	dat_t  A[ELASTICITY];	// storage; new words enter at index 0
+	assign	irdy = Rdy;
+
+	logic  Vld =  0;	// output register B holds a valid word
+	dat_t  B   = 'x;	// output register
+	assign	odat = B;
+	assign	ovld = Vld;
+
+	uwire  bload = !Vld || ordy;	// B is empty or being consumed this cycle
+	uwire  push = Rdy && ivld;	// input handshake completes
+	uwire  pop  = !Ptr[$left(Ptr)] && bload;	// Ptr >= 0 (sign bit clear) and B can take A[Ptr]
+
+	always_ff @(posedge clk) begin
+		if(push)  A <= { idat, A[0:ELASTICITY-2] };	// shift in at index 0; last entry drops out
+	end
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			Ptr <= '1;	// -1: nothing stored in A
+			Rdy <=  1;
+			Vld <=  0;
+			B   <= 'x;
+		end
+		else begin
+			// Make sure Rdy encodes what it's supposed to: space available in queue
+			assert(Rdy == (Ptr < signed'(ELASTICITY-1))) else begin
+				$error("%m: Broken Rdy computation.");
+				$stop;
+			end
+
+			Ptr <= Ptr + ((push == pop)? 0 : push? 1 : -1);	// push only: +1, pop only: -1
+			// pop == push: no change
+			// pop && !push: new space
+			// !pop && push: remaining space if not yet Ptr == ELASTICITY-2
+			Rdy <= (pop == push)? Rdy : pop? 1 : Ptr[$left(Ptr)] || (((ELASTICITY-2) & ~Ptr[$left(Ptr)-1:0]) != 0);
+			if(bload) begin
+				Vld <= !Ptr[$left(Ptr)];	// valid only if A held a word (Ptr >= 0)
+				B   <= A[Ptr[$left(Ptr)-1:0]];	// index with the sign bit stripped
+			end
+		end
+	end
+
+endmodule : queue
diff --git a/finn_xsi/finn_xsi/adapter.py b/finn_xsi/finn_xsi/adapter.py
index 0b73787a60..a10d7bde9c 100644
--- a/finn_xsi/finn_xsi/adapter.py
+++ b/finn_xsi/finn_xsi/adapter.py
@@ -47,7 +47,7 @@ def compile_sim_obj(top_module_name, source_list, sim_out_dir, debug=False, beha
 
     # sort src list so that packages are loaded first
     # these packages must be compiled before modules that depend on them
-    pkg_patterns = ["swg_pkg", "mvu_pkg"]
+    pkg_patterns = ["swg_pkg", "mvu_pkg", "pwpolyf_pkg"]
     srcs_list = sorted(
         source_list, key=lambda s: (not any(pkg in s for pkg in pkg_patterns), s)
     )
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 7c5d27dfb9..a89edb35ad 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -540,6 +540,13 @@ def apply_if_relevant(model, op_types, transform, desc=""):
 
     # Activation functions
     model = apply_if_relevant(model, ["Softmax"], to_hw.InferHWSoftmax(), "softmax layers")
+    # Piecewise polynomial activations (GELU, SiLU, Sigmoid, Tanh)
+    model = apply_if_relevant(
+        model,
+        ["PWPolyF", "Gelu", "Sigmoid", "Tanh", "Erf"],
+        to_hw.InferPWPolyFLayer(),
+        "piecewise polynomial activations",
+    )
 
     # Normalization layers
     model = apply_if_relevant(
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index f05198837b..c797749b04 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -69,6 +69,7 @@ def register_custom_op(cls):
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
 from finn.custom_op.fpgadataflow.outer_shuffle import OuterShuffle
 from finn.custom_op.fpgadataflow.pool import Pool
+from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
 from
# Copyright Advanced Micro Devices, Inc.
# SPDX-License-Identifier: BSD-3-Clause

import math
import numpy as np
from qonnx.core.datatype import DataType

from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp

# The octave count is fixed by the RTL segment decode and clamp range; K only
# sets how many mantissa subdivisions each of those fixed octaves receives.
_NUM_OCTAVES = 5
_SUPPORTED_FUNCS = {"gelu", "silu", "sigmoid", "tanh"}


class PWPolyF(HWCustomOp):
    """Hardware op for piecewise polynomial activations.

    Covers GELU, SiLU, Sigmoid and Tanh. The op is element-wise on FP32 data;
    the coefficients are baked into the RTL, so the node has no weights.
    """

    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)

    def get_nodeattr_types(self):
        """Return the node attribute table, merged with the base-class one."""
        own_attrs = {
            # activation function: gelu, silu, sigmoid, tanh
            "func": ("s", True, ""),
            # top-mantissa subdivision bits (K=3 gives 81 segments)
            "K": ("i", False, 3),
            # parallelism; elements processed per cycle
            "PE": ("i", True, 0),
            # number of channels (last dimension of input tensor)
            "NumChannels": ("i", True, 0),
            # FINN DataTypes for inputs, outputs (always FLOAT32)
            "inputDataType": ("s", True, ""),
            "outputDataType": ("s", True, ""),
            # polynomial degree (number of FMA stages per PE)
            "degree": ("i", False, 2),
            # number of input vectors, examples:
            # [1] is a single vector (like a FC layer with batch=1)
            # [4] is four vectors (like a FC layer with batch=4)
            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
            "numInputVectors": ("ints", False, [1]),
        }
        # Entries from the base class take precedence on a key clash.
        return {**own_attrs, **super().get_nodeattr_types()}

    def get_num_segments(self):
        """Return the segment count: one near-zero segment plus mirrored octaves."""
        subdivisions = 1 << self.get_nodeattr("K")
        return 1 + 2 * _NUM_OCTAVES * subdivisions

    def make_shape_compatible_op(self, model):
        """Return a constant-shape stand-in op with this node's output shape."""
        return super().make_const_shape_op(self.get_normal_output_shape())

    def infer_node_datatype(self, model):
        """Check for a FLOAT32 input and propagate that type to the output."""
        node = self.onnx_node
        in_dtype = model.get_tensor_datatype(node.input[0])
        assert in_dtype == DataType["FLOAT32"], "%s: PWPolyF requires FLOAT32 input, got %s" % (
            node.name,
            in_dtype,
        )
        for attr_name in ("inputDataType", "outputDataType"):
            self.set_nodeattr(attr_name, in_dtype.name)
        model.set_tensor_datatype(node.output[0], in_dtype)

    def verify_node(self):
        """Return a list of human-readable messages about the node attributes."""
        report = []

        if self.get_nodeattr("backend") == "fpgadataflow":
            report.append("Attribute backend is set correctly")
        else:
            report.append('Attribute backend should be set to "fpgadataflow"')

        func = self.get_nodeattr("func")
        if func not in _SUPPORTED_FUNCS:
            report.append("Attribute func must be one of %s, got %s" % (_SUPPORTED_FUNCS, func))
        else:
            report.append("Attribute func is set correctly")

        pe = self.get_nodeattr("PE")
        nch = self.get_nodeattr("NumChannels")
        folding_ok = pe > 0 and nch > 0 and nch % pe == 0
        report.append("PE divides NumChannels" if folding_ok else "PE must divide NumChannels evenly")

        # Datatype problems are only reported when present.
        in_name = self.get_nodeattr("inputDataType")
        if in_name != "FLOAT32":
            report.append("PWPolyF requires FLOAT32 input, got %s" % in_name)
        out_name = self.get_nodeattr("outputDataType")
        if out_name != "FLOAT32":
            report.append("PWPolyF requires FLOAT32 output, got %s" % out_name)

        return report

    def get_input_datatype(self, ind=0):
        """Returns FINN DataType of input."""
        name = self.get_nodeattr("inputDataType")
        return DataType[name]

    def get_output_datatype(self, ind=0):
        """Returns FINN DataType of output."""
        name = self.get_nodeattr("outputDataType")
        return DataType[name]

    def get_instream_width(self, ind=0):
        """Return the input stream width in bits (element width times PE)."""
        element_bits = self.get_input_datatype().bitwidth()
        return element_bits * self.get_nodeattr("PE")

    def get_outstream_width(self, ind=0):
        """Return the output stream width in bits (element width times PE)."""
        element_bits = self.get_output_datatype().bitwidth()
        return element_bits * self.get_nodeattr("PE")

    def get_folded_input_shape(self, ind=0):
        """Return the input shape with the channel axis split into (fold, PE)."""
        pe = self.get_nodeattr("PE")
        nch = self.get_nodeattr("NumChannels")
        outer = list(self.get_nodeattr("numInputVectors"))
        return (*outer, nch // pe, pe)

    def get_folded_output_shape(self, ind=0):
        """Return the folded output shape, identical to the folded input shape."""
        return self.get_folded_input_shape()

    def get_normal_input_shape(self, ind=0):
        """Return the unfolded input shape: numInputVectors plus NumChannels."""
        nch = self.get_nodeattr("NumChannels")
        outer = list(self.get_nodeattr("numInputVectors"))
        return (*outer, nch)

    def get_normal_output_shape(self, ind=0):
        """Return the unfolded output shape, identical to the input shape."""
        return self.get_normal_input_shape()

    def get_exp_cycles(self):
        """Return the expected cycle count: product of all folded dims but PE."""
        # II=1, latency amortised over stream length
        *stream_dims, _pe = self.get_folded_output_shape()
        return np.prod(stream_dims)

    def lut_estimation(self):
        """Return the LUT estimate, 100 per FMA stage and PE."""
        pe = self.get_nodeattr("PE")
        degree = self.get_nodeattr("degree")
        return 100 * degree * pe

    def bram_estimation(self):
        """Return the RAMB18 estimate for the per-stage coefficient ROMs."""
        pe = self.get_nodeattr("PE")
        degree = self.get_nodeattr("degree")
        depth = self.get_num_segments()

        if degree <= 1:
            return 0

        # Stages after the first use a registered dynamic coefficient lookup
        # for the DSP C input. Vivado infers this as one 32-bit wide ROM per
        # stage and PE, backed by RAMB18 for the default K=3 table depth.
        width = 32
        if width <= 18 or depth > 512:
            rows_per_bram, bits_per_bram = 1024, 18
        else:
            rows_per_bram, bits_per_bram = 512, 36
        bram18_per_rom = math.ceil(depth / rows_per_bram) * math.ceil(width / bits_per_bram)
        return pe * (degree - 1) * bram18_per_rom

    def uram_estimation(self):
        """Return the URAM estimate (always zero)."""
        return 0

    def dsp_estimation(self, fpgapart=None):
        """Return the DSP estimate, one per polynomial degree and PE."""
        pe = self.get_nodeattr("PE")
        degree = self.get_nodeattr("degree")
        return degree * pe

    def execute_node(self, context, graph):
        """Evaluate the node in software and store the result in *context*."""
        node = self.onnx_node
        data = context[node.input[0]]

        func = self.get_nodeattr("func")
        K = self.get_nodeattr("K")

        # lazy import to avoid hard dependency on torch at module level
        import torch  # noqa: PLC0415

        from finn.util.torch_hw_modules import PiecewisePolyActivation  # noqa: PLC0415

        degree = self.get_nodeattr("degree")
        activation = PiecewisePolyActivation(func, K=K, degree=degree)
        with torch.no_grad():
            result = activation(torch.from_numpy(data.astype(np.float32)))
        context[node.output[0]] = result.numpy()
# Copyright Advanced Micro Devices, Inc.
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import os
import shutil
from qonnx.core.datatype import DataType

from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
from finn.util.data_packing import array2hexstring
from finn.util.torch_hw_modules import (
    CLAMP_CFG,
    NUM_OCTAVES,
    SUPPORTED_FUNCS,
    _fit_coefficients,
)


def _float_to_hex(f):
    """Return the upper-case, prefix-free FLOAT32 hex encoding of *f*."""
    packed = array2hexstring(np.array([f]), DataType["FLOAT32"], 32, prefix="")
    return packed.upper()


def _generate_coeffs_pkg_data(K, degree=2, num_samples=1000):
    """Build the text of the pwpolyf_pkg.sv package for the given K and degree.

    The package declares a func_cfg_t struct type and one constant of that
    type per supported activation function, holding the clamping parameters
    and the fitted polynomial coefficients of every segment.
    """
    segment_count = 1 + 2 * NUM_OCTAVES * (1 << K)
    final_segment = segment_count - 1

    text = [
        "// Auto-generated by pwpolyf_rtl.py - do not edit manually.",
        "// DEGREE=%d K=%d NUM_OCTAVES=%d Segments: %d" % (degree, K, NUM_OCTAVES, segment_count),
        "",
        "package pwpolyf_pkg;",
        "",
        " localparam int unsigned DEGREE = %d;" % degree,
        " localparam int unsigned K = %d;" % K,
        " localparam int unsigned NUM_OCTAVES = %d;" % NUM_OCTAVES,
        " localparam int unsigned NUM_SEGS = %d;" % segment_count,
        "",
        " typedef struct {",
        " int unsigned neg_clamp;",
        " int unsigned pos_clamp;",
        " bit pos_passthrough;",
        " int unsigned coeffs[NUM_SEGS][DEGREE+1];",
        " } func_cfg_t;",
    ]

    for func_name in SUPPORTED_FUNCS:
        clamp = CLAMP_CFG[func_name]
        table = _fit_coefficients(func_name, K, degree=degree, num_samples=num_samples)

        text += [
            "",
            " localparam func_cfg_t %s = '{" % func_name.upper(),
            " neg_clamp: 32'h%s," % _float_to_hex(clamp["neg_clamp"]),
            " pos_clamp: 32'h%s," % _float_to_hex(clamp["pos_clamp"]),
            " pos_passthrough: %d," % (1 if clamp["pos_passthrough"] else 0),
            " coeffs: '{",
        ]
        for seg in range(segment_count):
            row = ", ".join("32'h%s" % _float_to_hex(table[seg, c]) for c in range(degree + 1))
            # every row but the last is followed by a comma
            separator = "" if seg == final_segment else ","
            text.append(" '{ %s }%s\t// seg %d" % (row, separator, seg))
        text += [" }", " };"]

    text += ["", "endpackage", ""]
    return "\n".join(text)


class PWPolyF_rtl(PWPolyF, RTLBackend):
    """RTL variant of PWPolyF, wraps the finn-rtllib pwpolyf IP."""

    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)

    def get_nodeattr_types(self):
        """Return the PWPolyF attributes merged with the RTLBackend ones."""
        # RTLBackend entries win on a key clash, as they are merged last.
        return {
            **PWPolyF.get_nodeattr_types(self),
            **RTLBackend.get_nodeattr_types(self),
        }

    def _generate_coeffs_pkg(self, num_samples=1000):
        """Return the package text for this node's K and degree attributes."""
        return _generate_coeffs_pkg_data(
            self.get_nodeattr("K"),
            degree=self.get_nodeattr("degree"),
            num_samples=num_samples,
        )

    def generate_hdl(self, model, fpgapart, clk):
        """Write wrapper, RTL sources and coefficient package to the codegen dir."""
        rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/")
        template_path = rtllib_dir + "pwpolyf_template_wrapper.v"
        out_dir = self.get_nodeattr("code_gen_dir_ipgen")

        pe = self.get_nodeattr("PE")
        func = self.get_nodeattr("func")
        topname = self.get_verilog_top_module_name()

        self.set_nodeattr("gen_top_module", topname)

        stream_bits = str(pe * 32)
        substitutions = {
            "$MODULE_NAME_AXI_WRAPPER$": topname,
            "$TOP_MODULE$": topname,
            "$PE$": str(pe),
            "$FUNC$": '"%s"' % func,
            "$IN_WIDTH$": stream_bits,
            "$OUT_WIDTH$": stream_bits,
        }

        # apply code generation to wrapper template
        with open(template_path, "r") as f:
            wrapper = f.read()
        for placeholder, replacement in substitutions.items():
            wrapper = wrapper.replace(placeholder, str(replacement))
        with open(os.path.join(out_dir, topname + ".v"), "w") as f:
            f.write(wrapper)

        # copy RTL source files
        for sv_file in ("pwpolyf.sv", "queue.sv"):
            shutil.copy(rtllib_dir + sv_file, out_dir)

        # generate package with coefficients matching the node's K and degree
        package_text = self._generate_coeffs_pkg()
        with open(os.path.join(out_dir, "pwpolyf_pkg.sv"), "w") as f:
            f.write(package_text)

        for attr_name in ("ipgen_path", "ip_path"):
            self.set_nodeattr(attr_name, out_dir)

    def get_rtl_file_list(self, abspath=False):
        """Return the RTL file names, optionally prefixed with their directories."""
        if abspath:
            gen_prefix = self.get_nodeattr("code_gen_dir_ipgen") + "/"
            lib_prefix = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/")
        else:
            gen_prefix = lib_prefix = ""

        # The package comes first; the generated wrapper comes last.
        return [
            gen_prefix + "pwpolyf_pkg.sv",
            lib_prefix + "pwpolyf.sv",
            lib_prefix + "queue.sv",
            gen_prefix + self.get_nodeattr("gen_top_module") + ".v",
        ]

    def execute_node(self, context, graph):
        """Dispatch execution according to the exec_mode attribute."""
        runners = {
            "cppsim": PWPolyF.execute_node,
            "rtlsim": RTLBackend.execute_node,
        }
        runner = runners.get(self.get_nodeattr("exec_mode"))
        # Any other mode executes nothing.
        if runner is not None:
            runner(self, context, graph)

    def code_generation_ipi(self):
        """Return the Tcl commands that add the sources and instantiate the cell."""
        out_dir = self.get_nodeattr("code_gen_dir_ipgen")

        file_names = [
            "pwpolyf_pkg.sv",
            "pwpolyf.sv",
            "queue.sv",
            self.get_nodeattr("gen_top_module") + ".v",
        ]
        cmd = ["add_files -norecurse %s" % os.path.join(out_dir, name) for name in file_names]
        cmd.append(
            "create_bd_cell -type module -reference %s %s"
            % (self.get_nodeattr("gen_top_module"), self.onnx_node.name)
        )
        return cmd
class InferPWPolyFLayer(Transformation):
    """Convert activations to piecewise polynomial HW layers.

    Four patterns are rewritten into a single ``PWPolyF`` node in the
    ``finn.custom_op.fpgadataflow`` domain:

    1. a ``PWPolyF`` node from any other domain (custom-op export path),
    2. a single ``Gelu`` or ``Tanh`` node,
    3. a ``Sigmoid`` node, alone or followed by ``Mul(x, Sigmoid(x))`` (SiLU),
    4. the Erf-based GELU decomposition ``x * 0.5 * (1 + erf(x / sqrt(2)))``.

    A pattern is only converted when the shape of its input tensor is known
    and the tensor is annotated as FLOAT32.
    """

    _SINGLE_OP_MAP = {"Gelu": "gelu", "Tanh": "tanh"}

    def __init__(self):
        super().__init__()

    @staticmethod
    def _is_const_scalar(model, tensor_name, value, tol=1e-3):
        """Check if *tensor_name* is a constant initializer equal to *value*."""
        init = model.get_initializer(tensor_name)
        if init is None:
            return False
        return init.size == 1 and abs(float(init.flat[0]) - value) < tol

    @staticmethod
    def _sole_consumer(model, tensor_name, op_type):
        """Return the only consumer of *tensor_name* if it has type *op_type*.

        Returns None when the tensor has no consumer, more than one consumer,
        or a single consumer of a different type.
        """
        # Guard against a None result before taking len().
        consumers = model.find_consumers(tensor_name) or []
        if len(consumers) == 1 and consumers[0].op_type == op_type:
            return consumers[0]
        return None

    @staticmethod
    def _other_input(node, tensor_name):
        """Return the single input of *node* that is not *tensor_name*, else None."""
        others = [name for name in node.input if name != tensor_name]
        return others[0] if len(others) == 1 else None

    def _match_erf_gelu(self, model, erf_node):
        """Match Erf-based GELU: Div(x,sqrt(2))→Erf→Add(_,1)→Mul→Mul.

        The two trailing Mul nodes apply the factors 0.5 and x; either
        multiplication order is accepted.
        Returns (pwp_input, pwp_output, nodes_to_remove) or None."""
        # backward: Erf input must come from Div(x, sqrt(2))
        div_node = model.find_producer(erf_node.input[0])
        if div_node is None or div_node.op_type != "Div":
            return None
        # Division is not commutative: Div(sqrt(2), x) computes sqrt(2) / x,
        # so the constant is only accepted as the divisor.
        if not self._is_const_scalar(model, div_node.input[1], 1.4142135):
            return None
        gelu_input = div_node.input[0]
        # The Div node is removed together with the pattern, so nothing but
        # the Erf node may read its output.
        if self._sole_consumer(model, div_node.output[0], "Erf") is None:
            return None

        # forward: Erf → Add(_, 1)
        add_node = self._sole_consumer(model, erf_node.output[0], "Add")
        if add_node is None:
            return None
        add_const = self._other_input(add_node, erf_node.output[0])
        if add_const is None or not self._is_const_scalar(model, add_const, 1.0):
            return None

        # Add → Mul → Mul
        first_mul = self._sole_consumer(model, add_node.output[0], "Mul")
        if first_mul is None:
            return None
        second_mul = self._sole_consumer(model, first_mul.output[0], "Mul")
        if second_mul is None:
            return None
        first_factor = self._other_input(first_mul, add_node.output[0])
        second_factor = self._other_input(second_mul, first_mul.output[0])
        if first_factor is None or second_factor is None:
            return None

        # One Mul must apply the constant 0.5 and the other the GELU input x.
        half_then_x = (
            self._is_const_scalar(model, first_factor, 0.5) and second_factor == gelu_input
        )
        x_then_half = first_factor == gelu_input and self._is_const_scalar(
            model, second_factor, 0.5
        )
        if not (half_then_x or x_then_half):
            return None

        nodes_to_remove = [div_node, erf_node, add_node, first_mul, second_mul]
        return (gelu_input, second_mul.output[0], nodes_to_remove)

    def _match_sigmoid(self, model, sigmoid_node):
        """Classify a Sigmoid node as standalone sigmoid or as part of SiLU.

        SiLU is detected when the Sigmoid output feeds exactly one Mul whose
        other input is the tensor that enters the Sigmoid.
        Returns (func, pwp_output, nodes_to_remove)."""
        sig_input = sigmoid_node.input[0]
        sig_output = sigmoid_node.output[0]
        mul_cand = self._sole_consumer(model, sig_output, "Mul")
        if mul_cand is not None and self._other_input(mul_cand, sig_output) == sig_input:
            return ("silu", mul_cand.output[0], [sigmoid_node, mul_cand])
        return ("sigmoid", sig_output, [sigmoid_node])

    @staticmethod
    def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3, degree=2):
        """Create the PWPolyF HW node; the last dim of *in_shape* is NumChannels."""
        num_channels = in_shape[-1]
        return helper.make_node(
            "PWPolyF",
            [pwp_input],
            [pwp_output],
            domain="finn.custom_op.fpgadataflow",
            backend="fpgadataflow",
            func=func,
            K=K,
            degree=degree,
            NumChannels=num_channels,
            PE=1,
            inputDataType=idt.name,
            outputDataType=idt.name,
            numInputVectors=list(in_shape[:-1]),
            name=name,
        )

    def apply(self, model):
        """Rewrite every supported pattern; return (model, graph_modified)."""
        graph = model.graph
        node_ind = 0
        graph_modified = False
        for node in graph.node:
            node_ind += 1
            # defaults used when the pattern carries no explicit K / degree
            K, degree = 3, 2

            # Case 1: PWPolyF custom op (dynamo=False export path)
            if node.op_type == "PWPolyF" and node.domain != "finn.custom_op.fpgadataflow":
                pwp_input = node.input[0]
                pwp_output = node.output[0]
                func = get_by_name(node.attribute, "func").s.decode("utf-8")
                K_attr = get_by_name(node.attribute, "K")
                if K_attr is not None:
                    K = K_attr.i
                degree_attr = get_by_name(node.attribute, "degree")
                if degree_attr is not None:
                    degree = degree_attr.i
                nodes_to_remove = [node]

            # Case 2: single-node standard ONNX activations (Gelu, Tanh)
            elif node.op_type in self._SINGLE_OP_MAP:
                pwp_input = node.input[0]
                pwp_output = node.output[0]
                func = self._SINGLE_OP_MAP[node.op_type]
                nodes_to_remove = [node]

            # Case 3: Sigmoid — standalone or part of SiLU pattern
            elif node.op_type == "Sigmoid":
                pwp_input = node.input[0]
                func, pwp_output, nodes_to_remove = self._match_sigmoid(model, node)

            # Case 4: Erf-based GELU (dynamo=True / opset < 20)
            elif node.op_type == "Erf":
                match = self._match_erf_gelu(model, node)
                if match is None:
                    continue
                pwp_input, pwp_output, nodes_to_remove = match
                func = "gelu"

            else:
                continue

            # Common guards, applied to every case including the custom-op
            # path: the channel count is read from the last dimension, and
            # only FLOAT32 inputs are converted.
            pwp_in_shape = model.get_tensor_shape(pwp_input)
            if pwp_in_shape is None or len(pwp_in_shape) < 1:
                continue
            idt = model.get_tensor_datatype(pwp_input)
            if idt != DataType["FLOAT32"]:
                continue

            new_node = self._make_pwpolyf_node(
                pwp_input,
                pwp_output,
                func,
                pwp_in_shape,
                idt,
                "PWPolyF_" + node.name,
                K,
                degree,
            )
            graph.node.insert(node_ind, new_node)
            for nd in nodes_to_remove:
                graph.node.remove(nd)
            graph_modified = True

        return (model, graph_modified)
Node %s will automatically be set to RTL variant.""" % ( node.op_type, @@ -158,6 +165,11 @@ def _determine_impl_style(node, fpgapart, model): warnings.warn(warn_str) return "hls" + elif optype == "PWPolyF": + if _pwpolyf_rtl_possible(node, fpgapart): + return "rtl" + else: + _raise_pwpolyf_unsupported(node, fpgapart) elif optype == "LayerNorm": if _layernorm_rtl_possible(node, fpgapart): return "rtl" @@ -346,6 +358,20 @@ def _layernorm_rtl_possible(n, fpgapart): return True +def _pwpolyf_rtl_possible(n, fpgapart): + # PWPolyF uses the Versal DSPFP32 primitive. + return is_versal(fpgapart) + + +def _raise_pwpolyf_unsupported(n, fpgapart): + raise Exception( + """PWPolyF node %s cannot be specialized for FPGA part %s. + PWPolyF_rtl uses the Versal DSPFP32 primitive and is only supported + on Versal devices.""" + % (n.name, fpgapart) + ) + + def _requant_rtl_possible(n, fpgapart): # Checks whether RTL-based Requant is supported # RTL Requant requires: diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py new file mode 100644 index 0000000000..0c426db05b --- /dev/null +++ b/src/finn/util/pwpolyf.py @@ -0,0 +1,39 @@ +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: BSD-3-Clause + +"""Compatibility imports for PWPolyF PyTorch utilities. + +The canonical home for PyTorch modules that match FINN hardware behavior is +``finn.util.torch_hw_modules``. This module is kept to avoid breaking existing +imports while downstream code moves to the new location. 
+""" + +from finn.util.torch_hw_modules import ( + CLAMP_CFG, + EXP_BASE, + EXP_BIAS, + EXP_CLAMP, + NUM_OCTAVES, + REFERENCE_FUNCS, + SUPPORTED_FUNCS, + PiecewisePolyActivation, + PWPolyFFunction, + _fit_coefficients, + _segment_boundaries, + _segment_index, +) + +__all__ = [ + "CLAMP_CFG", + "EXP_BIAS", + "EXP_BASE", + "EXP_CLAMP", + "NUM_OCTAVES", + "PWPolyFFunction", + "PiecewisePolyActivation", + "REFERENCE_FUNCS", + "SUPPORTED_FUNCS", + "_fit_coefficients", + "_segment_boundaries", + "_segment_index", +] diff --git a/src/finn/util/torch_hw_modules.py b/src/finn/util/torch_hw_modules.py new file mode 100644 index 0000000000..d73ae16f0c --- /dev/null +++ b/src/finn/util/torch_hw_modules.py @@ -0,0 +1,212 @@ +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: BSD-3-Clause + +""" +PyTorch modules that match FINN hardware-layer behavior. + +These modules are intended as drop-in PyTorch layers for modelling the +functional behavior of FINN hardware layers before conversion to HWCustomOps. 
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Constants matching the SystemVerilog pwpolyf module
NUM_OCTAVES = 5
EXP_BIAS = 127
EXP_BASE = 125
EXP_CLAMP = 130

SUPPORTED_FUNCS = ("gelu", "silu", "sigmoid", "tanh")

REFERENCE_FUNCS = {
    "gelu": lambda x: F.gelu(x),
    "silu": lambda x: F.silu(x),
    "sigmoid": lambda x: torch.sigmoid(x),
    "tanh": lambda x: torch.tanh(x),
}

CLAMP_CFG = {
    "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
    "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
    "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False},
    "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False},
}


def _segment_boundaries(K):
    """Return (lo, hi) bounds for every PWPolyF segment.

    Order: the near-zero segment, then the positive segments by octave and
    sub-segment, then the negative mirrors in the same octave/sub order.
    """
    num_subs = 1 << K

    positive = []
    for octave in range(NUM_OCTAVES):
        base = 2.0 ** (EXP_BASE + octave - EXP_BIAS)
        for sub in range(num_subs):
            lo = base * (1.0 + sub / num_subs)
            hi = base * (1.0 + (sub + 1) / num_subs)
            positive.append((lo, hi))

    # Segment 0 is the near-zero region; negatives mirror the positives.
    negative = [(-hi, -lo) for lo, hi in positive]
    return [(-0.25, 0.25)] + positive + negative


def _fit_coefficients(func_name, K, degree=2, num_samples=1000):
    """Fit degree-N polynomials per segment.

    Each segment is sampled at *num_samples* evenly spaced points and fitted
    by least squares against the reference function (evaluated in float32).
    Returns a float32 (segments, degree+1) tensor, lowest-order term first.
    """
    ref_fn = REFERENCE_FUNCS[func_name]
    bounds = _segment_boundaries(K)
    coeffs = np.zeros((len(bounds), degree + 1), dtype=np.float64)

    for seg, (lo, hi) in enumerate(bounds):
        xs = np.linspace(lo, hi, num_samples, dtype=np.float64)
        with torch.no_grad():
            ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64)
        c = np.polynomial.polynomial.polyfit(xs, ys, deg=degree)
        coeffs[seg] = c[: degree + 1]

    return torch.from_numpy(coeffs.astype(np.float32))


def _segment_index(x, K, num_subs, num_segs):
    """Map each element to its polynomial segment, mirroring SV addressing.

    Returns (seg_idx, is_neg_clamp, is_pos_clamp). The clamp masks mark
    elements with |x| >= 8; elements with |x| < 0.25 map to segment 0.

    The octave and sub-segment come from an exact mantissa/exponent split
    (``torch.frexp``). A rounded ``log2`` can reach the next integer for
    values just below a power of two and would then select the wrong octave.
    """
    abs_x = x.abs()
    is_neg = x < 0

    is_near_zero = abs_x < 0.25
    is_clamp = abs_x >= 8.0
    is_neg_clamp = is_neg & is_clamp
    is_pos_clamp = (~is_neg) & is_clamp

    safe_abs = abs_x.clamp(min=0.25)
    if safe_abs.dtype not in (torch.float32, torch.float64):
        # Widening to float32 is exact for narrower float types.
        safe_abs = safe_abs.float()

    # safe_abs == mantissa * 2**exponent with mantissa in [0.5, 1), hence
    # floor(log2(safe_abs)) == exponent - 1 and the octave (offset 2) is
    # exponent + 1; 2*mantissa - 1 is the fraction within the octave.
    mantissa, exponent = torch.frexp(safe_abs)
    octave = (exponent.long() + 1).clamp(0, NUM_OCTAVES - 1)
    frac = 2.0 * mantissa - 1.0
    sub = (frac * num_subs).long().clamp(0, num_subs - 1)

    pos_idx = 1 + octave * num_subs + sub
    neg_idx = 1 + NUM_OCTAVES * num_subs + octave * num_subs + sub

    seg_idx = torch.where(
        is_near_zero,
        torch.zeros_like(pos_idx),
        torch.where(is_neg, neg_idx, pos_idx),
    )
    seg_idx = seg_idx.clamp(0, num_segs - 1)

    return seg_idx, is_neg_clamp, is_pos_clamp


def _evaluate_piecewise(x, coeffs, neg_clamp_val, pos_clamp_val, pos_passthrough, K, degree):
    """Evaluate the piecewise polynomial on *x* and apply the clamps.

    Shared by the export function and the module so both compute the same
    result. The output has the shape of *x*.
    """
    num_subs = 1 << K
    num_segs = 1 + 2 * NUM_OCTAVES * num_subs

    orig_shape = x.shape
    x_flat = x.contiguous().view(-1)

    seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs)

    c = coeffs[seg_idx]
    # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...))
    y = c[:, degree]
    for i in range(degree - 1, -1, -1):
        y = c[:, i] + x_flat * y

    # Above the clamp range the output is either x itself or a constant.
    pos_val = x_flat if pos_passthrough else pos_clamp_val.expand_as(y)
    y = torch.where(is_pos_clamp, pos_val, y)
    y = torch.where(is_neg_clamp, neg_clamp_val.expand_as(y), y)

    return y.view(orig_shape)


class PWPolyFFunction(torch.autograd.Function):
    """Emit a single PWPolyF ONNX node during legacy torch.onnx export."""

    @staticmethod
    def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K, degree):
        """Compute the activation; *func* selects the clamp configuration."""
        return _evaluate_piecewise(
            x,
            coeffs,
            neg_clamp_val,
            pos_clamp_val,
            CLAMP_CFG[func]["pos_passthrough"],
            K,
            int(degree),
        )

    @staticmethod
    def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K, degree):
        """Emit one PWPolyF op carrying the func, K and degree attributes."""
        return g.op("PWPolyF", x, func_s=func, K_i=K, degree_i=degree)


class PiecewisePolyActivation(nn.Module):
    """
    Drop-in activation matching FINN's PWPolyF RTL behavior.

    Approximates nonlinear activations using piecewise polynomials over
    segments defined by FP32 bit extraction. The polynomial is evaluated via
    Horner's method to match the DSPFP32 FMA chain used by the RTL.

    Args:
        func: one of SUPPORTED_FUNCS.
        K: number of mantissa subdivision bits per octave.
        degree: polynomial degree per segment.
        fit_samples: sample points per segment used for the coefficient fit.

    Raises:
        ValueError: if *func* is not in SUPPORTED_FUNCS.
    """

    def __init__(self, func="gelu", K=3, degree=2, fit_samples=1000):
        super().__init__()
        if func not in SUPPORTED_FUNCS:
            raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS))

        self.func = func
        self.K = K
        self.degree = degree
        self.num_subs = 1 << K
        self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs
        self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]

        coeffs = _fit_coefficients(func, K, degree=degree, num_samples=fit_samples)
        self.register_buffer("coeffs", coeffs)

        neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32)
        pos_cv = torch.tensor(CLAMP_CFG[func]["pos_clamp"], dtype=torch.float32)
        self.register_buffer("neg_clamp_val", neg_cv)
        self.register_buffer("pos_clamp_val", pos_cv)

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape of *x*."""
        if torch.onnx.is_in_onnx_export():
            # Route through the autograd Function so that export emits a
            # single PWPolyF node.
            return PWPolyFFunction.apply(
                x,
                self.coeffs,
                self.neg_clamp_val,
                self.pos_clamp_val,
                self.func,
                self.K,
                self.degree,
            )

        return _evaluate_piecewise(
            x,
            self.coeffs,
            self.neg_clamp_val,
            self.pos_clamp_val,
            self.pos_passthrough,
            self.K,
            self.degree,
        )
"xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [num_channels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [num_channels]) + + pwpolyf_node = helper.make_node( + "PWPolyF", + ["inp"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + func=func, + K=K, + NumChannels=num_channels, + PE=1, + inputDataType="FLOAT32", + outputDataType="FLOAT32", + numInputVectors=num_input_vecs, + name="PWPolyF_0", + ) + + graph = helper.make_graph( + nodes=[pwpolyf_node], + name="pwpolyf_graph", + inputs=[inp], + outputs=[outp], + ) + model = helper.make_model(graph, producer_name="pwpolyf-test") + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def make_pwpolyf_rtl_inst(K=3, degree=2): + model = make_pwpolyf_modelwrapper("gelu", K, 4, [1]) + model = model.transform(SpecializeLayers(test_fpga_part)) + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("degree", degree) + return inst + + +@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"]) +@pytest.mark.parametrize("num_channels", [4, 16]) +@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) +@pytest.mark.parametrize("fold", [-1, 1, 2]) +@pytest.mark.fpgadataflow +def test_pwpolyf_cppsim(func, num_channels, num_input_vecs, fold): + K = 3 + if fold == -1: + fold = num_channels + pe = num_channels // fold + if num_channels % pe != 0: + pytest.skip("Invalid folding configuration.") + + model = make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs) + node = model.graph.node[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + + input_shape = tuple(num_input_vecs + [num_channels]) + x = np.random.uniform(-10, 10, 
input_shape).astype(np.float32) + + ref_mod = PiecewisePolyActivation(func, K=K) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + + input_dict = {"inp": x} + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + assert y_produced.shape == y_expected.shape + assert np.allclose(y_produced, y_expected, atol=1e-6) + + +@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"]) +@pytest.mark.fpgadataflow +def test_pwpolyf_onnx_export(func): + K = 3 + degree = 3 + num_channels = 32 + mod = PiecewisePolyActivation(func, K=K, degree=degree) + mod.eval() + dummy = torch.randn(1, num_channels) + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + tmpf = f.name + try: + torch.onnx.export( + mod, + dummy, + tmpf, + input_names=["input"], + output_names=["output"], + opset_version=13, + dynamo=False, + ) + import onnx # noqa: PLC0415 + + onnx_model = onnx.load(tmpf) + finally: + os.unlink(tmpf) + + pwp_nodes = [n for n in onnx_model.graph.node if n.op_type == "PWPolyF"] + assert len(pwp_nodes) == 1 + node = pwp_nodes[0] + func_attr = {a.name: a for a in node.attribute} + assert func_attr["func"].s.decode("utf-8") == func + assert func_attr["K"].i == K + assert func_attr["degree"].i == degree + + +@pytest.mark.parametrize("func", ["gelu", "sigmoid"]) +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_transform(func): + K = 3 + degree = 3 + num_channels = 16 + mod = PiecewisePolyActivation(func, K=K, degree=degree) + mod.eval() + dummy = torch.randn(1, num_channels) + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + tmpf = f.name + try: + torch.onnx.export( + mod, + dummy, + tmpf, + input_names=["inp"], + output_names=["outp"], + opset_version=13, + dynamo=False, + ) + model = ModelWrapper(tmpf) + finally: + os.unlink(tmpf) + + node = model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain != "finn.custom_op.fpgadataflow" + + model = model.transform(InferPWPolyFLayer()) 
+ + node = model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain == "finn.custom_op.fpgadataflow" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == func + assert inst.get_nodeattr("K") == K + assert inst.get_nodeattr("degree") == degree + assert inst.get_nodeattr("NumChannels") == num_channels + assert inst.get_nodeattr("PE") == 1 + assert inst.get_nodeattr("inputDataType") == "FLOAT32" + + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + input_dict = {"inp": x} + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + ref_mod = PiecewisePolyActivation(func, K=K, degree=degree) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + assert np.allclose(y_produced, y_expected, atol=1e-6) + + +@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"]) +@pytest.mark.fpgadataflow +def test_pwpolyf_specialize_rtl(func): + K = 3 + num_channels = 8 + model = make_pwpolyf_modelwrapper(func, K, num_channels, [1]) + model = model.transform(SpecializeLayers(test_fpga_part)) + + node = model.graph.node[0] + assert node.op_type == "PWPolyF_rtl" + assert node.domain == "finn.custom_op.fpgadataflow.rtl" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == func + assert inst.get_nodeattr("K") == K + + +@pytest.mark.fpgadataflow +def test_pwpolyf_specialize_rejects_non_versal(): + model = make_pwpolyf_modelwrapper("gelu", 3, 8, [1]) + + with pytest.raises(Exception, match="Versal"): + model.transform(SpecializeLayers(non_versal_fpga_part)) + + +@pytest.mark.parametrize("func", ["gelu", "tanh"]) +@pytest.mark.parametrize("pe", [1, 2, 4]) +@pytest.mark.parametrize("degree", [1, 2, 3]) +@pytest.mark.parametrize("K, bram18_per_coeff_rom", [(3, 1), (6, 2)]) +@pytest.mark.fpgadataflow +def test_pwpolyf_resource_estimates(func, pe, degree, K, bram18_per_coeff_rom): + num_channels = 8 + model = make_pwpolyf_modelwrapper(func, K, num_channels, [1]) + node = model.graph.node[0] + 
inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + inst.set_nodeattr("degree", degree) + + assert inst.dsp_estimation() == degree * pe + assert inst.lut_estimation() == 100 * degree * pe + assert inst.bram_estimation() == max(degree - 1, 0) * pe * bram18_per_coeff_rom + assert inst.uram_estimation() == 0 + + +@pytest.mark.parametrize("func", ["gelu", "sigmoid"]) +@pytest.mark.fpgadataflow +def test_pwpolyf_folded_shape(func): + K = 3 + num_channels = 12 + num_input_vecs = [1, 3, 3] + model = make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs) + node = model.graph.node[0] + inst = getCustomOp(node) + + # PE=1 + assert inst.get_normal_input_shape() == (1, 3, 3, 12) + assert inst.get_normal_output_shape() == (1, 3, 3, 12) + assert inst.get_folded_input_shape() == (1, 3, 3, 12, 1) + assert inst.get_folded_output_shape() == (1, 3, 3, 12, 1) + + # PE=4 + inst.set_nodeattr("PE", 4) + assert inst.get_folded_input_shape() == (1, 3, 3, 3, 4) + assert inst.get_folded_output_shape() == (1, 3, 3, 3, 4) + assert inst.get_instream_width() == 4 * 32 + assert inst.get_outstream_width() == 4 * 32 + + +@pytest.mark.parametrize("func", ["gelu", "silu"]) +@pytest.mark.fpgadataflow +def test_pwpolyf_exp_cycles(func): + """Verify expected cycle count estimation.""" + K = 3 + num_channels = 8 + pe = 2 + num_input_vecs = [1, 4, 4] + model = make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs) + node = model.graph.node[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + + # folded shape = (1, 4, 4, 4, 2), exp_cycles = prod of all but last = 1*4*4*4 = 64 + exp = inst.get_exp_cycles() + assert exp == 1 * 4 * 4 * (num_channels // pe) + + # exp_cycles_per_layer analysis only runs on specialized (rtl/hls) nodes + model = model.transform(SpecializeLayers(test_fpga_part)) + node = model.graph.node[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + exp_dict = model.analysis(exp_cycles_per_layer) + assert node.name in exp_dict + assert 
exp_dict[node.name] == exp + + +# ---------- helpers for standard ONNX op inference tests ---------- + + +def make_standard_activation_model(op_type, num_channels, num_input_vecs): + """Build an ONNX model with a single standard activation op.""" + shape = num_input_vecs + [num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + + act_node = helper.make_node(op_type, ["inp"], ["outp"], name=op_type + "_0") + graph = helper.make_graph([act_node], "test_graph", [inp], [outp]) + model = helper.make_model(graph, producer_name="test") + model.opset_import[0].version = 20 + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model + + +def make_silu_pattern_model(num_channels, num_input_vecs): + """Build ONNX model with Sigmoid + Mul pattern (SiLU).""" + shape = num_input_vecs + [num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape) + + sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0") + mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp"], name="Mul_0") + + graph = helper.make_graph( + [sigmoid_node, mul_node], + "silu_graph", + [inp], + [outp], + ) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model.graph.value_info.append(sig_out) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model + + +def make_erf_gelu_model(num_channels, num_input_vecs): + """Build ONNX model with the Erf-based GELU decomposition. 
+ + Pattern: x * 0.5 * (1 + erf(x / sqrt(2))) + Nodes: Div(x, sqrt(2)) -> Erf -> Add(_, 1) -> Mul(0.5, _) -> Mul(x, _) + """ + shape = num_input_vecs + [num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + + sqrt2 = helper.make_tensor("sqrt2", TensorProto.FLOAT, [], [np.float32(np.sqrt(2))]) + one = helper.make_tensor("one", TensorProto.FLOAT, [], [np.float32(1.0)]) + half = helper.make_tensor("half", TensorProto.FLOAT, [], [np.float32(0.5)]) + + div_node = helper.make_node("Div", ["inp", "sqrt2"], ["div_out"], name="Div_0") + erf_node = helper.make_node("Erf", ["div_out"], ["erf_out"], name="Erf_0") + add_node = helper.make_node("Add", ["erf_out", "one"], ["add_out"], name="Add_0") + mul_half_node = helper.make_node("Mul", ["half", "add_out"], ["mul_half_out"], name="Mul_0") + mul_x_node = helper.make_node("Mul", ["inp", "mul_half_out"], ["outp"], name="Mul_1") + + graph = helper.make_graph( + [div_node, erf_node, add_node, mul_half_node, mul_x_node], + "erf_gelu_graph", + [inp], + [outp], + initializer=[sqrt2, one, half], + ) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model + + +# ---------- standard ONNX op inference tests ---------- + + +@pytest.mark.parametrize( + "op_type,expected_func", + [ + ("Gelu", "gelu"), + ("Sigmoid", "sigmoid"), + ("Tanh", "tanh"), + ], +) +@pytest.mark.parametrize("num_channels", [4, 16]) +@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_standard_op(op_type, expected_func, num_channels, num_input_vecs): + model = make_standard_activation_model(op_type, num_channels, num_input_vecs) + + assert model.graph.node[0].op_type == op_type + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + node = 
model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain == "finn.custom_op.fpgadataflow" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == expected_func + assert inst.get_nodeattr("K") == 3 + assert inst.get_nodeattr("NumChannels") == num_channels + assert inst.get_nodeattr("PE") == 1 + assert inst.get_nodeattr("inputDataType") == "FLOAT32" + + +@pytest.mark.parametrize("num_channels", [4, 16]) +@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_silu_pattern(num_channels, num_input_vecs): + model = make_silu_pattern_model(num_channels, num_input_vecs) + + assert len(model.graph.node) == 2 + assert model.graph.node[0].op_type == "Sigmoid" + assert model.graph.node[1].op_type == "Mul" + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + node = model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain == "finn.custom_op.fpgadataflow" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == "silu" + assert inst.get_nodeattr("K") == 3 + assert inst.get_nodeattr("NumChannels") == num_channels + + +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_silu_reversed_mul_inputs(): + """SiLU detection works regardless of Mul input order.""" + num_channels = 8 + shape = [1, num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape) + + sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0") + mul_node = helper.make_node("Mul", ["sig_out", "inp"], ["outp"], name="Mul_0") + + graph = helper.make_graph([sigmoid_node, mul_node], "silu_graph", [inp], [outp]) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model.graph.value_info.append(sig_out) + model = model.transform(InferShapes()) 
+ model = model.transform(InferDataTypes()) + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + inst = getCustomOp(model.graph.node[0]) + assert inst.get_nodeattr("func") == "silu" + + +@pytest.mark.fpgadataflow +def test_pwpolyf_sigmoid_multi_consumer_no_silu(): + """Sigmoid with multiple consumers becomes standalone sigmoid, not silu.""" + num_channels = 8 + shape = [1, num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp1 = helper.make_tensor_value_info("outp1", TensorProto.FLOAT, shape) + outp2 = helper.make_tensor_value_info("outp2", TensorProto.FLOAT, shape) + sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape) + + sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0") + mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp1"], name="Mul_0") + identity_node = helper.make_node("Identity", ["sig_out"], ["outp2"], name="Id_0") + + graph = helper.make_graph( + [sigmoid_node, mul_node, identity_node], + "test_graph", + [inp], + [outp1, outp2], + ) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model.graph.value_info.append(sig_out) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + model = model.transform(InferPWPolyFLayer()) + + pwp_nodes = [n for n in model.graph.node if n.op_type == "PWPolyF"] + assert len(pwp_nodes) == 1 + inst = getCustomOp(pwp_nodes[0]) + assert inst.get_nodeattr("func") == "sigmoid" + # Mul and Identity should remain + assert any(n.op_type == "Mul" for n in model.graph.node) + assert any(n.op_type == "Identity" for n in model.graph.node) + + +@pytest.mark.parametrize( + "op_type,expected_func", + [ + ("Gelu", "gelu"), + ("Sigmoid", "sigmoid"), + ("Tanh", "tanh"), + ], +) +@pytest.mark.fpgadataflow +def test_pwpolyf_standard_op_execution(op_type, expected_func): + num_channels = 16 + model = 
make_standard_activation_model(op_type, num_channels, [1]) + model = model.transform(InferPWPolyFLayer()) + + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + y_produced = oxe.execute_onnx(model, {"inp": x})["outp"] + + ref_mod = PiecewisePolyActivation(expected_func, K=3) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + assert np.allclose(y_produced, y_expected, atol=1e-6) + + +@pytest.mark.fpgadataflow +def test_pwpolyf_silu_pattern_execution(): + num_channels = 16 + model = make_silu_pattern_model(num_channels, [1]) + model = model.transform(InferPWPolyFLayer()) + + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + y_produced = oxe.execute_onnx(model, {"inp": x})["outp"] + + ref_mod = PiecewisePolyActivation("silu", K=3) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + assert np.allclose(y_produced, y_expected, atol=1e-6) + + +# ---------- Erf-based GELU inference tests ---------- + + +@pytest.mark.parametrize("num_channels", [4, 16]) +@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_erf_gelu_pattern(num_channels, num_input_vecs): + """Erf-based GELU decomposition (opset < 20) is converted to PWPolyF.""" + model = make_erf_gelu_model(num_channels, num_input_vecs) + + assert len(model.graph.node) == 5 + assert model.graph.node[1].op_type == "Erf" + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + node = model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain == "finn.custom_op.fpgadataflow" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == "gelu" + assert inst.get_nodeattr("K") == 3 + assert inst.get_nodeattr("NumChannels") == num_channels + assert inst.get_nodeattr("PE") == 1 + assert inst.get_nodeattr("inputDataType") == "FLOAT32" + + +@pytest.mark.fpgadataflow +def test_pwpolyf_erf_gelu_execution(): + """Erf-based GELU produces 
same output as PiecewisePolyActivation.""" + num_channels = 16 + model = make_erf_gelu_model(num_channels, [1]) + model = model.transform(InferPWPolyFLayer()) + + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + y_produced = oxe.execute_onnx(model, {"inp": x})["outp"] + + ref_mod = PiecewisePolyActivation("gelu", K=3) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + assert np.allclose(y_produced, y_expected, atol=1e-6) + + +# ---------- coefficient package smoketests ---------- + + +@pytest.mark.parametrize("K", [2, 3, 4]) +@pytest.mark.fpgadataflow +def test_pwpolyf_generate_coeffs_pkg(K): + """Verify PWPolyF_rtl coefficient generation produces valid SystemVerilog.""" + pkg = make_pwpolyf_rtl_inst(K=K)._generate_coeffs_pkg() + + assert "package pwpolyf_pkg" in pkg + assert "endpackage" in pkg + # localparam lines use padded alignment in the generated SV + assert "DEGREE = 2;" in pkg + assert "K = %d;" % K in pkg + + num_segs = 1 + 2 * 5 * (1 << K) + assert "NUM_SEGS = %d;" % num_segs in pkg + + for func_label in ["GELU", "SILU", "SIGMOID", "TANH"]: + assert func_label + " = '{" in pkg + + seg_lines = [line for line in pkg.split("\n") if "// seg" in line] + # Each function has num_segs segments, 4 functions total + assert len(seg_lines) == 4 * num_segs + + +@pytest.mark.parametrize("degree", [1, 2, 3]) +@pytest.mark.fpgadataflow +def test_pwpolyf_generate_coeffs_pkg_degree(degree): + """Verify PWPolyF_rtl coefficient generation respects degree parameter.""" + K = 3 + pkg = make_pwpolyf_rtl_inst(K=K, degree=degree)._generate_coeffs_pkg() + + assert "DEGREE = %d;" % degree in pkg + # Each segment line should have degree+1 coefficient values + seg_lines = [line for line in pkg.split("\n") if "// seg 0" in line] + for line in seg_lines: + hex_vals = [s for s in line.split() if s.startswith("32'h")] + assert len(hex_vals) == degree + 1 + + +# ---------- generate_hdl smoketests ---------- + + 
@pytest.mark.parametrize("func", ["gelu", "tanh"])
@pytest.mark.parametrize("pe", [1, 2])
@pytest.mark.fpgadataflow
@pytest.mark.vivado
def test_pwpolyf_generate_hdl(func, pe):
    """Verify that PrepareIP leaves the expected RTL files in the code-gen directory.

    The model is specialized for ``test_fpga_part``, PE is set on the resulting
    ``PWPolyF_rtl`` node, and PrepareIP is run. The test then checks that the
    package, core, queue and top-level wrapper files exist and that the
    generated package mentions the expected DEGREE, K and function label.
    """
    import re  # noqa: PLC0415

    num_channels = 4
    model = make_pwpolyf_modelwrapper(func, 3, num_channels, [1])
    model = model.transform(SpecializeLayers(test_fpga_part))
    model = model.transform(GiveUniqueNodeNames())

    node = model.graph.node[0]
    assert node.op_type == "PWPolyF_rtl"
    inst = getCustomOp(node)
    inst.set_nodeattr("PE", pe)

    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))

    # Re-fetch node after transform (PrepareIP returns a new model)
    node = model.graph.node[0]
    inst = getCustomOp(node)

    code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
    assert code_gen_dir, "code_gen_dir_ipgen not set after PrepareIP"
    for fname in ("pwpolyf_pkg.sv", "pwpolyf.sv", "queue.sv"):
        generated = os.path.join(code_gen_dir, fname)
        assert os.path.isfile(generated), fname + " was not generated"

    topname = inst.get_nodeattr("gen_top_module")
    assert os.path.isfile(os.path.join(code_gen_dir, topname + ".v"))

    # Verify package content. The generated localparam lines may be padded for
    # alignment, so match on tokens with arbitrary whitespace around "=" rather
    # than on one exact spacing; this accepts everything the plain substring
    # checks accepted and keeps passing if the padding width changes.
    with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "r", encoding="utf-8") as f:
        pkg_content = f.read()
    assert re.search(r"DEGREE\s*=\s*2;", pkg_content)
    assert re.search(r"K\s*=\s*3;", pkg_content)
    assert re.search(re.escape(func.upper()) + r"\s*=\s*'\{", pkg_content)


# ---------- RTL simulation tests ----------


@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
@pytest.mark.parametrize("num_channels", [4, 8])
@pytest.mark.parametrize("pe", [1, 2, 4])
@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_pwpolyf_rtlsim(func, num_channels, pe):
    """Node-by-node RTL simulation of PWPolyF_rtl.

    Executes the unspecialized model to obtain a reference output, then
    specializes, prepares and simulates the RTL node and compares both the
    output values and the simulated cycle count against expectations.
    """
    if num_channels % pe != 0:
        pytest.skip("PE does not divide NumChannels")

    K = 3
    model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])

    # Reference output from executing the model before specialization.
    # A fixed seed keeps the stimulus identical between runs so that an
    # RTL-vs-reference mismatch can be reproduced.
    rng = np.random.default_rng(0)
    x = rng.uniform(-5, 5, (1, num_channels)).astype(np.float32)
    input_dict = {"inp": x}
    y_ref = oxe.execute_onnx(model, input_dict)["outp"]

    # Specialize to RTL and set PE
    model = model.transform(SpecializeLayers(test_fpga_part))
    model = model.transform(GiveUniqueNodeNames())
    node = model.graph.node[0]
    assert node.op_type == "PWPolyF_rtl"
    inst = getCustomOp(node)
    inst.set_nodeattr("PE", pe)

    # RTL simulation pipeline
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(PrepareRTLSim())

    y_rtl = oxe.execute_onnx(model, input_dict)["outp"]
    assert np.allclose(y_ref, y_rtl, atol=1e-4), "RTL output does not match cppsim reference"

    # Verify cycle count (re-fetch node after transforms)
    node = model.graph.node[0]
    inst = getCustomOp(node)
    cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
    exp_cycles = exp_cycles_dict[node.name]
    # The simulated count may differ from the estimate by up to 15 cycles.
    assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
    assert exp_cycles != 0


@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
@pytest.mark.parametrize("pe", [1, 2])
@pytest.mark.fpgadataflow
@pytest.mark.vivado
@pytest.mark.slow
def test_pwpolyf_rtlsim_stitched_ip(func, pe):
    """Stitched IP RTL simulation of PWPolyF_rtl.

    Executes the unspecialized model to obtain a reference output, builds a
    stitched IP (FIFO insertion, PrepareIP, HLSSynthIP, CreateStitchedIP) and
    checks that whole-design rtlsim reproduces the reference.
    """
    K = 3
    num_channels = 4
    model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])

    # Reference output from executing the model before specialization.
    # A fixed seed keeps the stimulus identical between runs so that a
    # mismatch can be reproduced.
    rng = np.random.default_rng(0)
    x = rng.uniform(-5, 5, (1, num_channels)).astype(np.float32)
    input_dict = {model.graph.input[0].name: x}
    y_ref = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name]

    # Specialize to RTL and set PE
    model = model.transform(SpecializeLayers(test_fpga_part))
    model = model.transform(GiveUniqueNodeNames())
    node = model.graph.node[0]
    inst = getCustomOp(node)
    inst.set_nodeattr("PE", pe)

    # Stitched IP pipeline
    model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model.set_metadata_prop("exec_mode", "rtlsim")

    # Look the tensor names up again on the transformed model instead of
    # reusing the names captured before the pipeline ran.
    input_dict = {model.graph.input[0].name: x}
    y_rtl = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name]
    assert np.allclose(
        y_ref, y_rtl, atol=1e-4
    ), "Stitched IP output does not match cppsim reference"