From 1d1399046d0cf61b00082352e75f814d0ff15cf3 Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Wed, 22 Apr 2026 11:43:10 +0100 Subject: [PATCH 01/12] pwpolyf initial integration (missing dynamo and nn.act) (hw stub) --- docs/finn/internals.rst | 3 + docs/finn/pwpolyf.md | 123 ++++++ .../finn.custom_op.fpgadataflow.rst | 9 + .../finn.custom_op.fpgadataflow.rtl.rst | 8 + docs/finn/source_code/finn.util.rst | 8 + finn-rtllib/pwpolyf/hdl/pwpolyf.sv | 356 ++++++++++++++++++ finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh | 344 +++++++++++++++++ .../pwpolyf/hdl/pwpolyf_template_wrapper.v | 69 ++++ finn-rtllib/pwpolyf/hdl/queue.sv | 78 ++++ src/finn/builder/build_dataflow_steps.py | 2 + src/finn/custom_op/fpgadataflow/__init__.py | 2 + src/finn/custom_op/fpgadataflow/pwpolyf.py | 187 +++++++++ .../custom_op/fpgadataflow/rtl/__init__.py | 2 + .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 289 ++++++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 47 +++ .../fpgadataflow/set_folding.py | 1 + src/finn/util/pwpolyf.py | 236 ++++++++++++ .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 281 ++++++++++++++ 18 files changed, 2045 insertions(+) create mode 100644 docs/finn/pwpolyf.md create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf.sv create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v create mode 100755 finn-rtllib/pwpolyf/hdl/queue.sv create mode 100644 src/finn/custom_op/fpgadataflow/pwpolyf.py create mode 100644 src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py create mode 100644 src/finn/util/pwpolyf.py create mode 100644 tests/fpgadataflow/test_fpgadataflow_pwpolyf.py diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index 0fd6c42350..438e64b077 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -247,6 +247,9 @@ Constraints to folding factors per layer * - Pool - PE - inp_channels % PE == 0 + * - PWPolyF + - PE + - NumChannels % PE == 0 * - Thresholding - PE - 
MH % PE == 0 diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md new file mode 100644 index 0000000000..c155470bae --- /dev/null +++ b/docs/finn/pwpolyf.md @@ -0,0 +1,123 @@ +# PWPolyF — Piecewise Polynomial Activation + +## Overview + +PWPolyF is a hardware activation layer that approximates nonlinear functions +(GELU, SiLU, Sigmoid, Tanh) using degree-2 piecewise polynomials. Each segment +is evaluated via Horner's method on two cascaded DSPFP32 FMA units, giving +single-cycle-per-element throughput with no BRAM usage. + +The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero +region, positive octave sub-segments, and negative mirrors. With the default +K=3 this gives 81 segments. Segment selection reuses the FP32 +exponent/mantissa bit-fields directly, matching the RTL implementation. + +Polynomial coefficients are generated at HDL build time by +`generate_coeffs_svh()` in `pwpolyf_sim.py`, which fits degree-2 polynomials +to the reference PyTorch functions and writes the `pwpolyf_coeffs.svh` header. +This ensures the RTL coefficients always match the configured K value. + +> **Note:** The RTL currently only supports K=3. Support for other K values +> is planned for a future update to `pwpolyf.sv`. + +## Architecture + +PWPolyF is **RTL-only** (no HLS variant). The pipeline is: + +``` +PiecewisePolyActivation (PyTorch) + | torch.onnx.export (dynamo=False) + v +PWPolyF ONNX node + | InferPWPolyFLayer + v +PWPolyF HW op (finn.custom_op.fpgadataflow) + | SpecializeLayers + v +PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl) + | generate_hdl + v +finn-rtllib/pwpolyf/hdl/ SystemVerilog IP +``` + +## Folding + +PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold. +Each PE instantiates its own polynomial evaluation pipeline (2 DSPs). +`SetFolding` handles PE selection automatically. 
+ +| PE | DSPs | Approx LUTs | Cycles (per spatial position) | +|----|------|-------------|-------------------------------| +| 1 | 2 | 200 | NumChannels | +| C | 2C | 200C | 1 | + +## Resource estimates + +- **DSP:** 2 per PE (two FP32 FMA stages) +- **LUT:** ~200 per PE (segment address decode + control) +- **BRAM/URAM:** 0 (coefficients stored in LUT/registers) + +## ONNX export + +`PiecewisePolyActivation` exports as a single `PWPolyF` custom op via +`torch.autograd.Function.symbolic()`. Requires the legacy TorchScript exporter +(`dynamo=False` in `torch.onnx.export`). + +Attributes on the ONNX node: +- `func` (string): one of `gelu`, `silu`, `sigmoid`, `tanh` +- `K` (int): mantissa subdivision bits (default 3) + +## Node attributes (HW op) + +| Attribute | Type | Description | +|--------------------|--------|------------------------------------------| +| `func` | string | Activation function name | +| `K` | int | Mantissa subdivision bits | +| `NumChannels` | int | Number of channels (last input dim) | +| `PE` | int | Processing elements | +| `inputDataType` | string | Input data type (FLOAT32) | +| `outputDataType` | string | Output data type (FLOAT32) | +| `numInputVectors` | ints | Batch/spatial dimensions | + +## Supported functions + +| Function | Negative clamp | Positive behaviour | +|----------|---------------|--------------------| +| GELU | 0.0 | passthrough (y=x) | +| SiLU | 0.0 | passthrough (y=x) | +| Sigmoid | 0.0 | clamp to 1.0 | +| Tanh | -1.0 | clamp to 1.0 | + +## Files + +### Python + +| File | Purpose | +|------|---------| +| `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) | +| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, coefficient SVH generation, rtlsim, IPI) | +| `util/pwpolyf.py` | PyTorch activation module, ONNX export, software simulation | +| `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation | +| 
`builder/build_dataflow_steps.py` | Build pipeline integration | +| `transformation/fpgadataflow/set_folding.py` | Folding support (pe_ops list) | + +### RTL + +| File | Purpose | +|------|---------| +| `finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Core polynomial evaluation pipeline | +| `finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh` | Default K=3 coefficients (regenerated at build time) | +| `finn-rtllib/pwpolyf/hdl/queue.sv` | Elastic FIFO for backpressure | +| `finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v` | AXI-Stream wrapper template | + +## Tests + +`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py` — 68 parametrized tests: + +- **cppsim**: all 4 functions x 2 channel counts x 2 spatial shapes x 3 foldings +- **ONNX export**: verifies single-node export for all functions +- **InferPWPolyFLayer**: end-to-end export → transform → execute +- **SpecializeLayers**: verifies RTL specialization +- **Resource estimates**: DSP/LUT/BRAM checks across PE values +- **Folded shapes**: input/output/stream width calculations +- **Expected cycles**: cycle count estimation + analysis pass integration diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index 25aafc324e..7660ea6dd3 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -144,6 +144,15 @@ finn.custom\_op.fpgadataflow.pool :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.pwpolyf +-------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.pwpolyf + :members: + :undoc-members: + :show-inheritance: + + finn.custom\_op.fpgadataflow.streamingdataflowpartition -------------------------------------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst index 346eddb073..e31176462f 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst @@ -45,6 +45,14 @@ finn.custom\_op.fpgadataflow.streamingfifo\_rtl :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.pwpolyf\_rtl +-------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.thresholding\_rtl ------------------------------------------------------- diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index 2ec1502441..fb9b8ddfff 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -171,6 +171,14 @@ finn.util.pytorch :show-inheritance: +finn.util.pwpolyf +------------------- + +.. automodule:: finn.util.pwpolyf + :members: + :undoc-members: + :show-inheritance: + finn.util.pyverilator --------------------- diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv new file mode 100644 index 0000000000..51196a9db6 --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv @@ -0,0 +1,356 @@ +/**************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief FP32 piecewise polynomial activation on DSPFP32. + * @author Shane Fleming + * + * @description + * Supports GELU, SiLU, Sigmoid, and Tanh via `parameter string FUNC`. 
+ * + * Approximated by piecewise degree-2 polynomials over segments defined + * by FP32 bit-extraction. Evaluated via Horner's method on a chain of + * 2 DSPFP32 instances, each computing FMA: out = C + A*B. + * + * Horner: y = a_0 + x*(a_1 + a_2*x) + * Stage 0: out = a_1 + a_2 * x (A=coeff[2], B=x, C=coeff[1]) + * Stage 1: out = a_0 + prev * x (A=prev, B=x, C=coeff[0]) + * + * Clamping for |x| >= 8 (5 octaves): + * GELU/SiLU: neg -> 0, pos -> x (pass-through) + * Sigmoid: neg -> 0, pos -> 1.0 + * Tanh: neg -> -1, pos -> 1.0 + * + * Latency: 8 cycles (2 DSP stages x 4 cycles each). II=1. + ***************************************************************************/ + +//===----------------------------------------------------------------------===// +// Single DSPFP32 FMA wrapper: r = c + a * b (a, b, c and r are raw 32-bit FP32 words; rst and rvld feed only the simulation checks at the end of the module, the DSP reset pins are tied low) +//===----------------------------------------------------------------------===// +module pwpolyf_dspfp32 ( + input logic clk, + input logic rst, + + input logic [31:0] a, + input logic [31:0] b, + input logic [31:0] c, + + output logic [31:0] r, + input logic rvld +); + + // FMA opmode: FPA_OUT = C + A*B + // FPOPMODE[6:5] = 00 (no sign flip on C or M) + // FPOPMODE[4:2] = 110 (select C for W mux, M for Z mux -- add path) + // FPOPMODE[1:0] = 01 (FP mode enable) + localparam logic [6:0] MODE_FMA = 7'b00_110_01; + + logic invalid; + logic overflow; + logic underflow; + + DSPFP32 #( + .A_FPTYPE("B32"), + .A_INPUT("DIRECT"), + .BCASCSEL("B"), + .B_D_FPTYPE("B32"), + .B_INPUT("DIRECT"), + .PCOUTSEL("FPA"), + .USE_MULT("MULTIPLY"), + .IS_CLK_INVERTED(1'b0), + .IS_FPINMODE_INVERTED(1'b0), + .IS_FPOPMODE_INVERTED(7'b0000000), + .IS_RSTA_INVERTED(1'b0), + .IS_RSTB_INVERTED(1'b0), + .IS_RSTC_INVERTED(1'b0), + .IS_RSTD_INVERTED(1'b0), + .IS_RSTFPA_INVERTED(1'b0), + .IS_RSTFPINMODE_INVERTED(1'b0), + .IS_RSTFPMPIPE_INVERTED(1'b0), + .IS_RSTFPM_INVERTED(1'b0), + .IS_RSTFPOPMODE_INVERTED(1'b0), + .ACASCREG(1), + .AREG(1), + .FPA_PREG(1), + .FPBREG(1), + .FPCREG(3), // C needs 3 pipeline 
stages to align with M output + .FPDREG(0), + .FPMPIPEREG(1), + .FPM_PREG(1), + .FPOPMREG(0), + .INMODEREG(0), + .RESET_MODE("SYNC") + ) DSPFP32_inst ( + .ACOUT_EXP(), .ACOUT_MAN(), .ACOUT_SIGN(), + .BCOUT_EXP(), .BCOUT_MAN(), .BCOUT_SIGN(), + .PCOUT(), + .FPM_INVALID(), .FPM_OVERFLOW(), .FPM_UNDERFLOW(), .FPM_OUT(), + .FPA_INVALID(invalid), .FPA_OVERFLOW(overflow), .FPA_UNDERFLOW(underflow), .FPA_OUT(r), + .ACIN_EXP('x), .ACIN_MAN('x), .ACIN_SIGN('x), + .BCIN_EXP('x), .BCIN_MAN('x), .BCIN_SIGN('x), + .PCIN('x), + .CLK(clk), + .FPINMODE('1), // Select B path (not D) + .FPOPMODE(MODE_FMA), + .A_SIGN(a[31]), .A_EXP(a[30:23]), .A_MAN(a[22:0]), + .B_SIGN(b[31]), .B_EXP(b[30:23]), .B_MAN(b[22:0]), + .C(c), + .D_SIGN('x), .D_EXP('x), .D_MAN('x), + .ASYNC_RST('0), + .CEA1('0), .CEA2('1), + .CEB('1), .CEC('1), .CED('0), + .CEFPA('1), .CEFPINMODE('0), .CEFPM('1), .CEFPMPIPE('1), .CEFPOPMODE('0), + .RSTA('0), .RSTB('0), .RSTC('0), .RSTD('0), + .RSTFPA('0), .RSTFPINMODE('0), .RSTFPM('0), .RSTFPMPIPE('0), .RSTFPOPMODE('0) + ); + + // Simulation-time warnings: the FPA status flags are reported only on cycles marked by rvld while rst is low + always_ff @(posedge clk) begin + if(!rst && rvld) begin + assert(!invalid) else $warning("%m generated invalid output."); + assert(!overflow) else $warning("%m generated an overflow."); + assert(!underflow) else $warning("%m generated an underflow."); + end + end + +endmodule : pwpolyf_dspfp32 + +//===----------------------------------------------------------------------===// +// Full PE-wide streaming activation with piecewise polynomial approximation. +// Hardcoded for DEGREE=2 from pwpolyf_coeffs.svh. 
+//===----------------------------------------------------------------------===// +module pwpolyf #( + int unsigned PE = 1, + string FUNC = "gelu" +)( + // Global Control + input logic clk, + input logic rst, + + // Input Stream - PE elements wide, one 32-bit FP32 word per PE lane + input logic [PE-1:0][31:0] xdat, + input logic xvld, + output logic xrdy, + + // Output Stream - PE elements wide, one 32-bit FP32 word per PE lane + output logic [PE-1:0][31:0] ydat, + output logic yvld, + input logic yrdy +); + + `include "pwpolyf_coeffs.svh" + + localparam int unsigned K = PWPOLYF_K; + localparam int unsigned NUM_SEGS = PWPOLYF_NUM_SEGS; + localparam int unsigned NUM_SUBS = 1 << K; + localparam int unsigned NUM_OCTAVES = PWPOLYF_NUM_OCTAVES; + localparam int unsigned DSP_LAT = 4; + localparam int unsigned LATENCY = 2 * DSP_LAT; // DEGREE=2: two chained FMA stages of DSP_LAT cycles each + + initial begin + assert(PWPOLYF_DEGREE == 2) else begin + $error("%m: This implementation requires PWPOLYF_DEGREE == 2."); + $finish; + end + assert(FUNC == "gelu" || FUNC == "silu" || FUNC == "sigmoid" || FUNC == "tanh") else begin + $error("%m: Unsupported FUNC=\"%s\". Must be gelu|silu|sigmoid|tanh.", FUNC); + $finish; + end + end + + //=== Per-activation clamping parameters ================================== + localparam logic [31:0] NEG_CLAMP_VAL = + FUNC == "tanh" ? 32'hBF800000 : 32'h00000000; // tanh: -1.0, else: 0.0 + localparam logic [31:0] POS_CLAMP_VAL = + (FUNC == "sigmoid" || FUNC == "tanh") ? 32'h3F800000 : 32'h00000000; // sigmoid/tanh: 1.0 + localparam bit POS_PASSTHROUGH = + (FUNC == "gelu" || FUNC == "silu") ? 1 : 0; // gelu/silu: output=x + + //=== Coefficient selection =============================================== + localparam logic [31:0] COEFFS[NUM_SEGS][3] = + FUNC == "gelu" ? PWPOLYF_GELU_COEFFS : + FUNC == "silu" ? PWPOLYF_SILU_COEFFS : + FUNC == "sigmoid" ? 
PWPOLYF_SIGMOID_COEFFS : + PWPOLYF_TANH_COEFFS; + + //=== Clamping exponent threshold ========================================= + localparam int unsigned EXP_CLAMP = 130; // |x| >= 8.0 (biased FP32 exponent 130 corresponds to 2^3) + + //=== Input Sidestep Register ============================================= + typedef logic [PE-1:0][31:0] fp_vec_t; + + uwire take; // issue strobe: the current operand vector enters the compute pipeline this cycle + + typedef struct { + fp_vec_t val; + logic rdy; + } ibuf_t; + ibuf_t Ibuf = '{ val: 'x, rdy: '1 }; // val follows xdat while rdy is set; rdy drops when a valid input is not taken and returns on take + always_ff @(posedge clk) begin + if(rst) + Ibuf <= '{ val: 'x, rdy: '1 }; + else begin + if(Ibuf.rdy) Ibuf.val <= xdat; + Ibuf.rdy <= (Ibuf.rdy && !xvld) || take; + end + end + assign xrdy = Ibuf.rdy; + uwire fp_vec_t x_cur = Ibuf.rdy? xdat : Ibuf.val; // live input while ready, otherwise the held copy + + //=== Credit-based Operation Issue ======================================== + localparam int unsigned CREDIT = LATENCY + 3; // pipeline + sidestep + queue read + logic signed [$clog2(CREDIT):0] Credit = -CREDIT; // counts up on take and down on give; issue is allowed only while negative (sign bit set) + uwire give = yvld && yrdy; // an output beat is accepted downstream, returning one credit + assign take = (xvld || !xrdy) && Credit[$left(Credit)]; + always_ff @(posedge clk) begin + if(rst) Credit <= -CREDIT; + else Credit <= Credit + (give == take? 0 : give? 
-1 : 1); + end + + //=== Per-PE Compute Pipeline ============================================= + uwire fp_vec_t r; + uwire [PE-1:0] rvld_vec; + uwire rvld; + + for(genvar pe = 0; pe < PE; pe++) begin : genPE + uwire [31:0] xi = x_cur[pe]; + + //--- Segment selector (combinational) -------------------------------- + uwire sign = xi[31]; + uwire [7:0] exp_bits = xi[30:23]; + uwire [K-1:0] sub = xi[22:23-K]; + + // Octave index: exp 125->0, 126->1, 127->2, 128->3, 129->4 (3-bit result; exponents outside 125..129 are covered by the near-zero and clamp flags below) + uwire [2:0] octave = exp_bits - 8'd125; + + // Classify: near-zero when the biased exponent is below 125; clamp when it is at or above EXP_CLAMP, split by sign + uwire is_near_zero = (exp_bits < 8'd125); + uwire is_pos_clamp = !sign && (exp_bits >= EXP_CLAMP); + uwire is_neg_clamp = sign && (exp_bits >= EXP_CLAMP); + + // Segment index for ROM lookup: 0 is the near-zero segment, positive segments start at 1, negative segments follow after NUM_SUBS*NUM_OCTAVES entries; the fixed 7-bit index and 3-bit octave match K=3. NOTE(review): clamped inputs can yield an index past NUM_SEGS-1 (e.g. negative x with biased exponent 130 gives 81 or more); the clamp mux discards that result, but confirm the out-of-range COEFFS read causes no spurious simulation warnings + uwire [6:0] seg_idx; + if(1) begin : blkSegIdx + uwire [6:0] pos_idx = 7'd1 + {1'b0, octave, sub}; + uwire [6:0] neg_idx = 7'(7'd1 + NUM_SUBS * NUM_OCTAVES) + {1'b0, octave, sub}; + assign seg_idx = is_near_zero? 7'd0 : + sign? neg_idx : pos_idx; + end : blkSegIdx + + //--- Coefficient lookup (combinational) ------------------------------ + uwire [31:0] coeff_a0 = COEFFS[seg_idx][0]; + uwire [31:0] coeff_a1 = COEFFS[seg_idx][1]; + uwire [31:0] coeff_a2 = COEFFS[seg_idx][2]; + + //--- Horner chain: 2 stages of pwpolyf_dspfp32 ---------------------- + // Stage 0: s0 = a1 + a2 * x (latency: 4 cycles) + // Stage 1: s1 = a0 + s0 * x (latency: 4 cycles) + // Total: 8 cycles + + // Valid pipeline: take shifted through LATENCY stages; the top bit marks the stage 1 result + logic [LATENCY-1:0] Vld = '0; + always_ff @(posedge clk) begin + if(rst) Vld <= '0; + else Vld <= { Vld[$left(Vld)-1:0], take }; + end + assign rvld_vec[pe] = Vld[$left(Vld)]; + + // Delay x by 4 cycles for stage 1 input + logic [31:0] Xd1 = 'x; + logic [31:0] Xd2 = 'x; + logic [31:0] Xd3 = 'x; + logic [31:0] Xd4 = 'x; + always_ff @(posedge clk) begin + Xd1 <= xi; + Xd2 <= Xd1; + Xd3 <= Xd2; + Xd4 <= Xd3; + end + + // Delay x by 8 cycles for pass-through on positive clamp + logic [31:0] Xd5 = 'x; + logic [31:0] Xd6 = 'x; + logic [31:0] Xd7 = 'x; + logic [31:0] Xd8 = 
'x; + always_ff @(posedge clk) begin + Xd5 <= Xd4; + Xd6 <= Xd5; + Xd7 <= Xd6; + Xd8 <= Xd7; + end + + // Delay a0 by 4 cycles for stage 1 C input + logic [31:0] C0d1 = 'x; + logic [31:0] C0d2 = 'x; + logic [31:0] C0d3 = 'x; + logic [31:0] C0d4 = 'x; + always_ff @(posedge clk) begin + C0d1 <= coeff_a0; + C0d2 <= C0d1; + C0d3 <= C0d2; + C0d4 <= C0d3; + end + + // Stage 0: s0 = coeff_a1 + coeff_a2 * xi (valid bit 3 marks the stage 0 result, DSP_LAT cycles after take) + uwire [31:0] s0; + pwpolyf_dspfp32 dsp0 ( + .clk, .rst, + .a(coeff_a2), .b(xi), .c(coeff_a1), + .r(s0), .rvld(Vld[3]) + ); + + // Stage 1: s1 = a0_delayed + s0 * x_delayed (valid bit 7 is the end of the valid pipeline) + uwire [31:0] s1; + pwpolyf_dspfp32 dsp1 ( + .clk, .rst, + .a(s0), .b(Xd4), .c(C0d4), + .r(s1), .rvld(Vld[7]) + ); + + //--- Clamp mux ------------------------------------------------------- + logic [LATENCY-1:0] NegClamp = '0; + logic [LATENCY-1:0] PosClamp = '0; + always_ff @(posedge clk) begin + if(rst) begin + NegClamp <= '0; + PosClamp <= '0; + end + else begin + NegClamp <= { NegClamp[$left(NegClamp)-1:0], is_neg_clamp }; + PosClamp <= { PosClamp[$left(PosClamp)-1:0], is_pos_clamp }; + end + end + + // Output mux: negative clamp first, then positive clamp (delayed x or constant), otherwise the polynomial result s1 + assign r[pe] = NegClamp[$left(NegClamp)]? NEG_CLAMP_VAL : + PosClamp[$left(PosClamp)]? (POS_PASSTHROUGH? 
Xd8 : POS_CLAMP_VAL) : + s1; + + end : genPE + + // All PE results should be valid simultaneously (every lane shifts the same take, so lane 0 stands in for the whole vector) + assign rvld = rvld_vec[0]; + always_ff @(posedge clk) begin + assert(rvld_vec == {(PE){rvld}}) else begin + $error("%m: Inconsistent output valid indications."); + $stop; + end + end + + //=== Credit-backing Elastic Output Queue ================================= + uwire rrdy; // queue input ready; only checked by the assertion below, never used as backpressure + queue #(.DATA_WIDTH($bits(fp_vec_t)), .ELASTICITY(CREDIT)) obuf ( + .clk, .rst, + .idat(r), .ivld(rvld), .irdy(rrdy), + .odat(ydat), .ovld(yvld), .ordy(yrdy) + ); + always_ff @(posedge clk) begin + assert(rrdy || !rvld) else begin + $error("%m: Result queue overrun."); + $stop; + end + end + +endmodule : pwpolyf diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh b/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh new file mode 100644 index 0000000000..4783a69a8c --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh @@ -0,0 +1,344 @@ +// Auto-generated by pwpolyf_sim.py — do not edit manually. +// K=3, NUM_SEGS=81, NUM_OCTAVES=5, DEGREE=2 + +localparam int unsigned PWPOLYF_K = 3; +localparam int unsigned PWPOLYF_NUM_SEGS = 81; +localparam int unsigned PWPOLYF_NUM_OCTAVES = 5; +localparam int unsigned PWPOLYF_DEGREE = 2; + +localparam logic [31:0] PWPOLYF_GELU_COEFFS[81][3] = '{ + '{ 32'h37B98E70, 32'h3F000000, 32'h3ECA71FE }, // seg 0 + '{ 32'hBA7ADBC7, 32'h3F0278C5, 32'h3EBE3708 }, // seg 1 + '{ 32'hBAC1FAC3, 32'h3F036C36, 32'h3EBAD62D }, // seg 2 + '{ 32'hBB0F7119, 32'h3F049537, 32'h3EB7205C }, // seg 3 + '{ 32'hBB4C3199, 32'h3F05F665, 32'h3EB31D7A }, // seg 4 + '{ 32'hBB8CE270, 32'h3F0793DC, 32'h3EAECF5D }, // seg 5 + '{ 32'hBBBD42FF, 32'h3F096FF1, 32'h3EAA3BDC }, // seg 6 + '{ 32'hBBF86AD7, 32'h3F0B8C93, 32'h3EA5686A }, // seg 7 + '{ 32'hBC1FE938, 32'h3F0DEDAD, 32'h3EA05544 }, // seg 8 + '{ 32'hBC61967B, 32'h3F11FAE4, 32'h3E985544 }, // seg 9 + '{ 32'hBCA9F9E2, 32'h3F1853F6, 32'h3E8D0DA0 }, // seg 10 + '{ 32'hBCF4024D, 32'h3F1FBA03, 32'h3E81380E }, // seg 11 + '{ 32'hBD283275, 
32'h3F281F23, 32'h3E6A0515 }, // seg 12 + '{ 32'hBD6012DC, 32'h3F316E8A, 32'h3E5131A8 }, // seg 13 + '{ 32'hBD90E65A, 32'h3F3B8AB1, 32'h3E384EA9 }, // seg 14 + '{ 32'hBDB69B12, 32'h3F46505E, 32'h3E1FAEE2 }, // seg 15 + '{ 32'hBDE0E236, 32'h3F519680, 32'h3E07A0FB }, // seg 16 + '{ 32'hBE13A447, 32'h3F62F98E, 32'h3DCA8920 }, // seg 17 + '{ 32'hBE483C53, 32'h3F7A5DD5, 32'h3D6E8B3A }, // seg 18 + '{ 32'hBE7FADD1, 32'h3F8848AA, 32'h3CC08378 }, // seg 19 + '{ 32'hBE9AF2F6, 32'h3F9227E3, 32'hBB967C2C }, // seg 20 + '{ 32'hBEB35A6C, 32'h3F9A4E1A, 32'hBCD3D2D3 }, // seg 21 + '{ 32'hBEC733E2, 32'h3FA06D38, 32'hBD2655D9 }, // seg 22 + '{ 32'hBED5117D, 32'h3FA46699, 32'hBD4ACA8A }, // seg 23 + '{ 32'hBEDC221E, 32'h3FA64B98, 32'hBD5B0CC8 }, // seg 24 + '{ 32'hBED977F6, 32'h3FA5A98F, 32'hBD563F50 }, // seg 25 + '{ 32'hBEC1A7EF, 32'h3FA066C3, 32'hBD310A4D }, // seg 26 + '{ 32'hBE9AF247, 32'h3F98AA16, 32'hBCFF13C8 }, // seg 27 + '{ 32'hBE609014, 32'h3F90E5FD, 32'hBCA4952F }, // seg 28 + '{ 32'hBE1465DA, 32'h3F8A89C8, 32'hBC412BC3 }, // seg 29 + '{ 32'hBDB39147, 32'h3F860470, 32'hBBCFC6D8 }, // seg 30 + '{ 32'hBD47BD32, 32'h3F832984, 32'hBB4E0A09 }, // seg 31 + '{ 32'hBCCC65EE, 32'h3F8187F7, 32'hBABC9CBA }, // seg 32 + '{ 32'hBC07A969, 32'h3F807817, 32'hB9D51508 }, // seg 33 + '{ 32'hBABA43A4, 32'h3F8012B3, 32'hB870A727 }, // seg 34 + '{ 32'hB93762D6, 32'h3F800216, 32'hB6C22359 }, // seg 35 + '{ 32'h3411E06E, 32'h3F800000, 32'h27AB3551 }, // seg 36 + '{ 32'h341E8FEE, 32'h3F800000, 32'h28A3B0E4 }, // seg 37 + '{ 32'h342B3EFE, 32'h3F800000, 32'hA7CDB1C0 }, // seg 38 + '{ 32'h3437EE42, 32'h3F800000, 32'hA8538CE4 }, // seg 39 + '{ 32'h34449DB9, 32'h3F800000, 32'hA71AF986 }, // seg 40 + '{ 32'hBA7AD37E, 32'h3EFB0E96, 32'h3EBE3747 }, // seg 41 + '{ 32'hBAC20DB3, 32'h3EF92715, 32'h3EBAD556 }, // seg 42 + '{ 32'hBB0F6B5B, 32'h3EF6D5D9, 32'h3EB720C8 }, // seg 43 + '{ 32'hBB4C290B, 32'h3EF41395, 32'h3EB31DFE }, // seg 44 + '{ 32'hBB8CE04D, 32'h3EF0D873, 32'h3EAECF95 }, // seg 45 + '{ 
32'hBBBD43D9, 32'h3EED2010, 32'h3EAA3BCD }, // seg 46 + '{ 32'hBBF87DB4, 32'h3EE8E58C, 32'h3EA566F9 }, // seg 47 + '{ 32'hBC1FE3E0, 32'h3EE42555, 32'h3EA055F8 }, // seg 48 + '{ 32'hBC6197C8, 32'h3EDC0A12, 32'h3E98551F }, // seg 49 + '{ 32'hBCA9FA8F, 32'h3ECF57EF, 32'h3E8D0D83 }, // seg 50 + '{ 32'hBCF40310, 32'h3EC08BD5, 32'h3E8137F1 }, // seg 51 + '{ 32'hBD2834D6, 32'h3EAFC0E3, 32'h3E6A03EA }, // seg 52 + '{ 32'hBD6013D1, 32'h3E9D229D, 32'h3E513144 }, // seg 53 + '{ 32'hBD90E5AD, 32'h3E88EB08, 32'h3E384F28 }, // seg 54 + '{ 32'hBDB6985F, 32'h3E66C185, 32'h3E1FB08A }, // seg 55 + '{ 32'hBDE0DF77, 32'h3E39A8DB, 32'h3E07A275 }, // seg 56 + '{ 32'hBE13A40F, 32'h3DE8345F, 32'h3DCA8983 }, // seg 57 + '{ 32'hBE483C8C, 32'h3CB44289, 32'h3D6E8AAF }, // seg 58 + '{ 32'hBE7FAE75, 32'hBD848C95, 32'h3CC08085 }, // seg 59 + '{ 32'hBE9AF25E, 32'hBE113D70, 32'hBB9669BA }, // seg 60 + '{ 32'hBEB35AF3, 32'hBE527226, 32'hBCD3D632 }, // seg 61 + '{ 32'hBEC73409, 32'hBE81B50C, 32'hBD265643 }, // seg 62 + '{ 32'hBED511FB, 32'hBE919AF1, 32'hBD4ACBC1 }, // seg 63 + '{ 32'hBEDC2198, 32'hBE992DD4, 32'hBD5B0BA5 }, // seg 64 + '{ 32'hBED9784C, 32'hBE96A68E, 32'hBD563FE9 }, // seg 65 + '{ 32'hBEC1A80B, 32'hBE819B22, 32'hBD310A73 }, // seg 66 + '{ 32'hBE9AF281, 32'hBE45510D, 32'hBCFF1457 }, // seg 67 + '{ 32'hBE60906A, 32'hBE073026, 32'hBCA4958A }, // seg 68 + '{ 32'hBE146346, 32'hBDA8992A, 32'hBC41276D }, // seg 69 + '{ 32'hBDB38EDC, 32'hBD408B15, 32'hBBCFC36C }, // seg 70 + '{ 32'hBD47AFC4, 32'hBCCA5237, 32'hBB4DF9C3 }, // seg 71 + '{ 32'hBCCC9FAA, 32'hBC443685, 32'hBABCD98B }, // seg 72 + '{ 32'hBC07BF94, 32'hBB7057D9, 32'hB9D53AA2 }, // seg 73 + '{ 32'hBABACF46, 32'hBA160B9E, 32'hB8715A94 }, // seg 74 + '{ 32'hB93E544F, 32'hB88BBAC8, 32'hB6CD58B9 }, // seg 75 + '{ 32'hB80D0FA0, 32'hB7425B86, 32'hB585D73F }, // seg 76 + '{ 32'h00000000, 32'h00000000, 32'h00000000 }, // seg 77 + '{ 32'h00000000, 32'h00000000, 32'h00000000 }, // seg 78 + '{ 32'h00000000, 32'h00000000, 32'h00000000 }, // seg 
79 + '{ 32'h00000000, 32'h00000000, 32'h00000000 } // seg 80 +}; + +localparam logic [31:0] PWPOLYF_SILU_COEFFS[81][3] = '{ + '{ 32'h36E95DF5, 32'h3F000000, 32'h3E7EDC5E }, // seg 0 + '{ 32'hB99F1DCE, 32'h3F00C86D, 32'h3E771EC2 }, // seg 1 + '{ 32'hB9F6B213, 32'h3F01162E, 32'h3E74F652 }, // seg 2 + '{ 32'hBA36FFF7, 32'h3F017588, 32'h3E72946C }, // seg 3 + '{ 32'hBA82DBFB, 32'h3F01E7EE, 32'h3E6FFB3C }, // seg 4 + '{ 32'hBAB54FCB, 32'h3F026E5A, 32'h3E6D2ECC }, // seg 5 + '{ 32'hBAF49B79, 32'h3F030A0C, 32'h3E6A30AE }, // seg 6 + '{ 32'hBB212F9B, 32'h3F03BBB5, 32'h3E6704CC }, // seg 7 + '{ 32'hBB50782F, 32'h3F048563, 32'h3E63A86F }, // seg 8 + '{ 32'hBB945CE5, 32'h3F05E1D4, 32'h3E5E4873 }, // seg 9 + '{ 32'hBBE25E30, 32'h3F080BF2, 32'h3E569788 }, // seg 10 + '{ 32'hBC24C259, 32'h3F0A9F94, 32'h3E4E59B3 }, // seg 11 + '{ 32'hBC66B42F, 32'h3F0D9E75, 32'h3E45A38E }, // seg 12 + '{ 32'hBC9C5244, 32'h3F11080E, 32'h3E3C8A8D }, // seg 13 + '{ 32'hBCCE04C3, 32'h3F14DA5B, 32'h3E3322D2 }, // seg 14 + '{ 32'hBD04800C, 32'h3F191096, 32'h3E298289 }, // seg 15 + '{ 32'hBD26E43B, 32'h3F1DA640, 32'h3E1FBAD7 }, // seg 16 + '{ 32'hBD637E9F, 32'h3F252125, 32'h3E10F462 }, // seg 17 + '{ 32'hBDA33B9D, 32'h3F301FA3, 32'h3DFAD009 }, // seg 18 + '{ 32'hBDDE6B32, 32'h3F3BF5F1, 32'h3DD4EBDE }, // seg 19 + '{ 32'hBE1121CB, 32'h3F484C5B, 32'h3DB103B4 }, // seg 20 + '{ 32'hBE369CC7, 32'h3F54CBA1, 32'h3D8FABD4 }, // seg 21 + '{ 32'hBE5EB1E1, 32'h3F612225, 32'h3D6290D3 }, // seg 22 + '{ 32'hBE842968, 32'h3F6D086D, 32'h3D2C2202 }, // seg 23 + '{ 32'hBE99340A, 32'h3F7842D6, 32'h3CF862F4 }, // seg 24 + '{ 32'hBEB7B0F6, 32'h3F83AB48, 32'h3C810FDC }, // seg 25 + '{ 32'hBEDD21FD, 32'h3F8C031A, 32'h3AA0895B }, // seg 26 + '{ 32'hBEFBE1D7, 32'h3F922E5A, 32'hBC0A6628 }, // seg 27 + '{ 32'hBF0931A3, 32'h3F9649B5, 32'hBC6A59DD }, // seg 28 + '{ 32'hBF101E8F, 32'h3F989B85, 32'hBC8E0AB0 }, // seg 29 + '{ 32'hBF12EEED, 32'h3F997B24, 32'hBC96B85F }, // seg 30 + '{ 32'hBF121CE8, 32'h3F994071, 32'hBC94AB86 }, // seg 
31 + '{ 32'hBF0E4CE0, 32'h3F983CFA, 32'hBC8C0C06 }, // seg 32 + '{ 32'hBF047881, 32'h3F95D0C8, 32'hBC71DF34 }, // seg 33 + '{ 32'hBEE597C0, 32'h3F91E386, 32'hBC3A0377 }, // seg 34 + '{ 32'hBEBEB59D, 32'h3F8DFF08, 32'hBC081E5A }, // seg 35 + '{ 32'hBE994535, 32'h3F8A965C, 32'hBBC0C1D9 }, // seg 36 + '{ 32'hBE700CF7, 32'h3F87CFE4, 32'hBB856F55 }, // seg 37 + '{ 32'hBE37FE86, 32'h3F85A6F4, 32'hBB35A189 }, // seg 38 + '{ 32'hBE0A8F49, 32'h3F8406CD, 32'hBAF420DA }, // seg 39 + '{ 32'hBDCD8C89, 32'h3F82D4E6, 32'hBAA265F3 }, // seg 40 + '{ 32'hB99F15B4, 32'h3EFE6F36, 32'h3E771F07 }, // seg 41 + '{ 32'hB9F6E275, 32'h3EFDD355, 32'h3E74F54A }, // seg 42 + '{ 32'hBA370764, 32'h3EFD14DB, 32'h3E729434 }, // seg 43 + '{ 32'hBA82CBD2, 32'h3EFC307E, 32'h3E6FFC3C }, // seg 44 + '{ 32'hBAB519A0, 32'h3EFB2461, 32'h3E6D318E }, // seg 45 + '{ 32'hBAF49002, 32'h3EF9EC20, 32'h3E6A3138 }, // seg 46 + '{ 32'hBB213894, 32'h3EF88848, 32'h3E670422 }, // seg 47 + '{ 32'hBB509289, 32'h3EF6F461, 32'h3E63A6AF }, // seg 48 + '{ 32'hBB946026, 32'h3EF43C28, 32'h3E5E481D }, // seg 49 + '{ 32'hBBE25D58, 32'h3EEFE827, 32'h3E56979C }, // seg 50 + '{ 32'hBC24C311, 32'h3EEAC0C7, 32'h3E4E599C }, // seg 51 + '{ 32'hBC66B0C8, 32'h3EE4C363, 32'h3E45A3FC }, // seg 52 + '{ 32'hBC9C5250, 32'h3EDDEFE3, 32'h3E3C8A8A }, // seg 53 + '{ 32'hBCCE0263, 32'h3ED64BA6, 32'h3E33233F }, // seg 54 + '{ 32'hBD0483B9, 32'h3ECDDDD0, 32'h3E29816B }, // seg 55 + '{ 32'hBD26E21A, 32'h3EC4B40E, 32'h3E1FBB6C }, // seg 56 + '{ 32'hBD637E94, 32'h3EB5BDB7, 32'h3E10F463 }, // seg 57 + '{ 32'hBDA33BD9, 32'h3E9FC0A2, 32'h3DFACFDF }, // seg 58 + '{ 32'hBDDE6AB7, 32'h3E88144E, 32'h3DD4EC2A }, // seg 59 + '{ 32'hBE11213C, 32'h3E5ECF5B, 32'h3DB10440 }, // seg 60 + '{ 32'hBE369BA8, 32'h3E2CD2EE, 32'h3D8FACC1 }, // seg 61 + '{ 32'hBE5EB20C, 32'h3DF6EE82, 32'h3D6290A4 }, // seg 62 + '{ 32'hBE8428DF, 32'h3D97BEFB, 32'h3D2C2354 }, // seg 63 + '{ 32'hBE9933B8, 32'h3CF7AA94, 32'h3CF8644F }, // seg 64 + '{ 32'hBEB7B09B, 32'hBCEACCA1, 32'h3C811126 }, 
// seg 65 + '{ 32'hBEDD2230, 32'hBDC0323A, 32'h3AA081A5 }, // seg 66 + '{ 32'hBEFBE177, 32'hBE11723B, 32'hBC0A645F }, // seg 67 + '{ 32'hBF093217, 32'hBE324EF2, 32'hBC6A5D77 }, // seg 68 + '{ 32'hBF101F44, 32'hBE44DDF5, 32'hBC8E0CF8 }, // seg 69 + '{ 32'hBF12EEFF, 32'hBE4BD952, 32'hBC96B899 }, // seg 70 + '{ 32'hBF121E42, 32'hBE4A0685, 32'hBC94AED3 }, // seg 71 + '{ 32'hBF0E4E2A, 32'hBE41EA78, 32'hBC8C0EC0 }, // seg 72 + '{ 32'hBF047922, 32'hBE2E876E, 32'hBC71E175 }, // seg 73 + '{ 32'hBEE5994E, 32'hBE0F1D79, 32'hBC3A059D }, // seg 74 + '{ 32'hBEBEB455, 32'hBDDFEE7F, 32'hBC081CCC }, // seg 75 + '{ 32'hBE9948EA, 32'hBDA96AD6, 32'hBBC0C8E9 }, // seg 76 + '{ 32'hBE701045, 32'hBD7A00B1, 32'hBB8571EB }, // seg 77 + '{ 32'hBE3805EB, 32'hBD34E735, 32'hBB35ABED }, // seg 78 + '{ 32'hBE0A9538, 32'hBD00E03E, 32'hBAF42F6A }, // seg 79 + '{ 32'hBDCD9AF0, 32'hBCB54853, 32'hBAA2755A } // seg 80 +}; + +localparam logic [31:0] PWPOLYF_SIGMOID_COEFFS[81][3] = '{ + '{ 32'h3F000000, 32'h3E7F33B4, 32'hB21FFF88 }, // seg 0 + '{ 32'h3EFFCF27, 32'h3E822CCD, 32'hBC84C1F2 }, // seg 1 + '{ 32'h3EFFBC74, 32'h3E82B1D2, 32'hBC938C36 }, // seg 2 + '{ 32'h3EFFA5B5, 32'h3E834361, 32'hBCA21BEF }, // seg 3 + '{ 32'h3EFF8A9F, 32'h3E83E0E0, 32'hBCB06BD4 }, // seg 4 + '{ 32'h3EFF6B53, 32'h3E8487A2, 32'hBCBE4EDE }, // seg 5 + '{ 32'h3EFF47DE, 32'h3E853610, 32'hBCCBB848 }, // seg 6 + '{ 32'h3EFF1FD8, 32'h3E85ED00, 32'hBCD8C92D }, // seg 7 + '{ 32'h3EFEF3E7, 32'h3E86A89A, 32'hBCE54D90 }, // seg 8 + '{ 32'h3EFEA7FC, 32'h3E87D4B7, 32'hBCF7D81E }, // seg 9 + '{ 32'h3EFE3434, 32'h3E897082, 32'hBD075E74 }, // seg 10 + '{ 32'h3EFDB0A7, 32'h3E8B15AC, 32'hBD11E821 }, // seg 11 + '{ 32'h3EFD1FFF, 32'h3E8CBAB3, 32'hBD1B7B94 }, // seg 12 + '{ 32'h3EFC85A0, 32'h3E8E568F, 32'hBD2411C8 }, // seg 13 + '{ 32'h3EFBE681, 32'h3E8FDE72, 32'hBD2B9C7D }, // seg 14 + '{ 32'h3EFB46C8, 32'h3E914BCF, 32'hBD322448 }, // seg 15 + '{ 32'h3EFAAB5A, 32'h3E9297AA, 32'hBD37AD8F }, // seg 16 + '{ 32'h3EF9DBB3, 32'h3E9432BF, 32'hBD3E0993 
}, // seg 17 + '{ 32'h3EF909EB, 32'h3E95A9C9, 32'hBD43470B }, // seg 18 + '{ 32'h3EF8B573, 32'h3E9632E0, 32'hBD450418 }, // seg 19 + '{ 32'h3EF909F7, 32'h3E95B9B4, 32'hBD43A89C }, // seg 20 + '{ 32'h3EFA2B65, 32'h3E943959, 32'hBD3FAB94 }, // seg 21 + '{ 32'h3EFC33ED, 32'h3E91B9FE, 32'hBD3988A9 }, // seg 22 + '{ 32'h3EFF34ED, 32'h3E8E4C2C, 32'hBD31B440 }, // seg 23 + '{ 32'h3F01997B, 32'h3E8A0ACB, 32'hBD28A198 }, // seg 24 + '{ 32'h3F057A8B, 32'h3E8262C5, 32'hBD198434 }, // seg 25 + '{ 32'h3F0C3064, 32'h3E6CECF8, 32'hBD0452A9 }, // seg 26 + '{ 32'h3F1447E1, 32'h3E530782, 32'hBCDF318F }, // seg 27 + '{ 32'h3F1D4A68, 32'h3E38CE80, 32'hBCB905FB }, // seg 28 + '{ 32'h3F26C195, 32'h3E1F8C72, 32'hBC9750E8 }, // seg 29 + '{ 32'h3F3044A9, 32'h3E081DC6, 32'hBC74E58A }, // seg 30 + '{ 32'h3F3982E1, 32'h3DE5F174, 32'hBC448461 }, // seg 31 + '{ 32'h3F424019, 32'h3DC09FBD, 32'hBC1CAB6F }, // seg 32 + '{ 32'h3F4DF478, 32'h3D924A5B, 32'hBBDD9BF8 }, // seg 33 + '{ 32'h3F5B2AC7, 32'h3D464E25, 32'hBB897CAF }, // seg 34 + '{ 32'h3F656FAC, 32'h3D045E36, 32'hBB291861 }, // seg 35 + '{ 32'h3F6D25C5, 32'h3CAEBA7A, 32'hBACED519 }, // seg 36 + '{ 32'h3F72CA96, 32'h3C64B5A7, 32'hBA7C2B6D }, // seg 37 + '{ 32'h3F76D7D9, 32'h3C14B4A4, 32'hBA196B60 }, // seg 38 + '{ 32'h3F79B538, 32'h3BC060EC, 32'hB9BA7A5A }, // seg 39 + '{ 32'h3F7BB5E3, 32'h3B77B8AD, 32'hB9626A80 }, // seg 40 + '{ 32'h3F001880, 32'h3E822DEE, 32'h3C84E3F8 }, // seg 41 + '{ 32'h3F0021CE, 32'h3E82B23F, 32'h3C939805 }, // seg 42 + '{ 32'h3F002D26, 32'h3E834368, 32'h3CA21C3C }, // seg 43 + '{ 32'h3F003A99, 32'h3E83DFE3, 32'h3CB05650 }, // seg 44 + '{ 32'h3F004A09, 32'h3E84848C, 32'h3CBE0FC2 }, // seg 45 + '{ 32'h3F005BFD, 32'h3E853553, 32'h3CCBAA1A }, // seg 46 + '{ 32'h3F007027, 32'h3E85EDAA, 32'h3CD8D518 }, // seg 47 + '{ 32'h3F008689, 32'h3E86AC9A, 32'h3CE58F3C }, // seg 48 + '{ 32'h3F00AC00, 32'h3E87D4A7, 32'h3CF7D732 }, // seg 49 + '{ 32'h3F00E601, 32'h3E89713C, 32'h3D076365 }, // seg 50 + '{ 32'h3F0127C8, 32'h3E8B1652, 
32'h3D11EC06 }, // seg 51 + '{ 32'h3F016FFD, 32'h3E8CBAA0, 32'h3D1B7B31 }, // seg 52 + '{ 32'h3F01BD0C, 32'h3E8E55D8, 32'h3D240E23 }, // seg 53 + '{ 32'h3F020CD3, 32'h3E8FDED1, 32'h3D2B9E4A }, // seg 54 + '{ 32'h3F025CCD, 32'h3E914CAB, 32'h3D322819 }, // seg 55 + '{ 32'h3F02AA33, 32'h3E929729, 32'h3D37AB80 }, // seg 56 + '{ 32'h3F031236, 32'h3E9432FB, 32'h3D3E0A78 }, // seg 57 + '{ 32'h3F037AF9, 32'h3E95A98C, 32'h3D43463B }, // seg 58 + '{ 32'h3F03A527, 32'h3E96327F, 32'h3D4502ED }, // seg 59 + '{ 32'h3F037B01, 32'h3E95B9AA, 32'h3D43A880 }, // seg 60 + '{ 32'h3F02EA67, 32'h3E94399C, 32'h3D3FAC3D }, // seg 61 + '{ 32'h3F01E5FE, 32'h3E91B9E3, 32'h3D39886A }, // seg 62 + '{ 32'h3F00659B, 32'h3E8E4C54, 32'h3D31B499 }, // seg 63 + '{ 32'h3EFCCCD9, 32'h3E8A0A9A, 32'h3D28A135 }, // seg 64 + '{ 32'h3EF50AC9, 32'h3E8262A6, 32'h3D1983F9 }, // seg 65 + '{ 32'h3EE79F36, 32'h3E6CECF3, 32'h3D0452A4 }, // seg 66 + '{ 32'h3ED77017, 32'h3E530747, 32'h3CDF3133 }, // seg 67 + '{ 32'h3EC56B50, 32'h3E38CEAF, 32'h3CB9063C }, // seg 68 + '{ 32'h3EB27DCB, 32'h3E1F8DAD, 32'h3C97527C }, // seg 69 + '{ 32'h3E9F76E9, 32'h3E081E0B, 32'h3C74E62F }, // seg 70 + '{ 32'h3E8CFAEA, 32'h3DE5F2F3, 32'h3C448608 }, // seg 71 + '{ 32'h3E770090, 32'h3DC0A0B8, 32'h3C1CAC71 }, // seg 72 + '{ 32'h3E482E7E, 32'h3D924AB4, 32'h3BDD9CA1 }, // seg 73 + '{ 32'h3E13552B, 32'h3D464E9B, 32'h3B897D12 }, // seg 74 + '{ 32'h3DD481AF, 32'h3D045D7B, 32'h3B29173F }, // seg 75 + '{ 32'h3D96D28F, 32'h3CAEBB74, 32'h3ACED672 }, // seg 76 + '{ 32'h3D5358A4, 32'h3C64B82E, 32'h3A7C2E99 }, // seg 77 + '{ 32'h3D1283F5, 32'h3C14B667, 32'h3A196D6B }, // seg 78 + '{ 32'h3CC95645, 32'h3BC05DE7, 32'h39BA76F4 }, // seg 79 + '{ 32'h3C8947BE, 32'h3B77C111, 32'h39627305 } // seg 80 +}; + +localparam logic [31:0] PWPOLYF_TANH_COEFFS[81][3] = '{ + '{ 32'h24C775B8, 32'h3F7CD991, 32'hA73006D1 }, // seg 0 + '{ 32'hBBAC00F6, 32'h3F87D4AF, 32'hBE77D79E }, // seg 1 + '{ 32'hBBE5F686, 32'h3F8970F2, 32'hBE87616C }, // seg 2 + '{ 32'hBC13E04D, 
32'h3F8B1626, 32'hBE91EAFF }, // seg 3 + '{ 32'hBC38062F, 32'h3F8CBAF6, 32'hBE9B7D12 }, // seg 4 + '{ 32'hBC5E87C9, 32'h3F8E55E8, 32'hBEA40E6C }, // seg 5 + '{ 32'hBC833183, 32'h3F8FDE94, 32'hBEAB9D26 }, // seg 6 + '{ 32'hBC97304B, 32'h3F914C74, 32'hBEB22725 }, // seg 7 + '{ 32'hBCAA905C, 32'h3F929762, 32'hBEB7AC6C }, // seg 8 + '{ 32'hBCC48DB5, 32'h3F9432FE, 32'hBEBE0A82 }, // seg 9 + '{ 32'hBCDEBF5B, 32'h3F95A99C, 32'hBEC34673 }, // seg 10 + '{ 32'hBCE94C8C, 32'h3F9632A2, 32'hBEC50358 }, // seg 11 + '{ 32'hBCDEBE2C, 32'h3F95B992, 32'hBEC3A83E }, // seg 12 + '{ 32'hBCBA9ED9, 32'h3F9439CF, 32'hBEBFACBF }, // seg 13 + '{ 32'hBC72F4C3, 32'h3F91B9B3, 32'hBEB987F9 }, // seg 14 + '{ 32'hBB4B4F04, 32'h3F8E4C70, 32'hBEB1B4D9 }, // seg 15 + '{ 32'h3C4CBE96, 32'h3F8A0AC9, 32'hBEA8A196 }, // seg 16 + '{ 32'h3D2F5351, 32'h3F8262A8, 32'hBE9983FC }, // seg 17 + '{ 32'h3DC30600, 32'h3F6CED04, 32'hBE8452B2 }, // seg 18 + '{ 32'h3E223FEB, 32'h3F53072C, 32'hBE5F310C }, // seg 19 + '{ 32'h3E6A52FA, 32'h3F38CE9A, 32'hBE39061F }, // seg 20 + '{ 32'h3E9B0431, 32'h3F1F8DD2, 32'hBE1752AC }, // seg 21 + '{ 32'h3EC1122B, 32'h3F081E0D, 32'hBDF4E633 }, // seg 22 + '{ 32'h3EE60A55, 32'h3EE5F2C5, 32'hBDC485D6 }, // seg 23 + '{ 32'h3F047F8C, 32'h3EC0A114, 32'hBD9CACD1 }, // seg 24 + '{ 32'h3F1BE8C6, 32'h3E924AAB, 32'hBD5D9C90 }, // seg 25 + '{ 32'h3F365584, 32'h3E464E44, 32'hBD097CC9 }, // seg 26 + '{ 32'h3F4ADF8C, 32'h3E045D95, 32'hBCA91767 }, // seg 27 + '{ 32'h3F5A4B6D, 32'h3DAEBB15, 32'hBC4ED5EF }, // seg 28 + '{ 32'h3F6594EE, 32'h3D64B815, 32'hBBFC2E78 }, // seg 29 + '{ 32'h3F6DAF91, 32'h3D14B5D0, 32'hBB996CB5 }, // seg 30 + '{ 32'h3F736A9F, 32'h3CC05DAA, 32'hBB3A76B3 }, // seg 31 + '{ 32'h3F776B54, 32'h3C77C747, 32'hBAE2796C }, // seg 32 + '{ 32'h3F7B290A, 32'h3C00F553, 32'hBA590596 }, // seg 33 + '{ 32'h3F7DD423, 32'h3B51CDAD, 32'hB99FBCDD }, // seg 34 + '{ 32'h3F7F0B0E, 32'h3AA91301, 32'hB8EB151C }, // seg 35 + '{ 32'h3F7F95A3, 32'h3A073EF5, 32'hB82D041E }, // seg 36 + '{ 32'h3F7FD274, 
32'h3956AD85, 32'hB77E46D5 }, // seg 37 + '{ 32'h3F7FECAF, 32'h38A9A5DF, 32'hB6BB164C }, // seg 38 + '{ 32'h3F7FF7E0, 32'h3805A163, 32'hB609E0DD }, // seg 39 + '{ 32'h3F7FFCA1, 32'h37505C2F, 32'hB549DC3A }, // seg 40 + '{ 32'h3BAC00F6, 32'h3F87D4AF, 32'h3E77D79E }, // seg 41 + '{ 32'h3BE5F686, 32'h3F8970F2, 32'h3E87616C }, // seg 42 + '{ 32'h3C13E04D, 32'h3F8B1626, 32'h3E91EAFF }, // seg 43 + '{ 32'h3C38062F, 32'h3F8CBAF6, 32'h3E9B7D12 }, // seg 44 + '{ 32'h3C5E87C9, 32'h3F8E55E8, 32'h3EA40E6C }, // seg 45 + '{ 32'h3C833183, 32'h3F8FDE94, 32'h3EAB9D26 }, // seg 46 + '{ 32'h3C97304B, 32'h3F914C74, 32'h3EB22725 }, // seg 47 + '{ 32'h3CAA905C, 32'h3F929762, 32'h3EB7AC6C }, // seg 48 + '{ 32'h3CC48DB5, 32'h3F9432FE, 32'h3EBE0A82 }, // seg 49 + '{ 32'h3CDEBF5B, 32'h3F95A99C, 32'h3EC34673 }, // seg 50 + '{ 32'h3CE94C8C, 32'h3F9632A2, 32'h3EC50358 }, // seg 51 + '{ 32'h3CDEBE2C, 32'h3F95B992, 32'h3EC3A83E }, // seg 52 + '{ 32'h3CBA9ED9, 32'h3F9439CF, 32'h3EBFACBF }, // seg 53 + '{ 32'h3C72F4C3, 32'h3F91B9B3, 32'h3EB987F9 }, // seg 54 + '{ 32'h3B4B4F04, 32'h3F8E4C70, 32'h3EB1B4D9 }, // seg 55 + '{ 32'hBC4CBE96, 32'h3F8A0AC9, 32'h3EA8A196 }, // seg 56 + '{ 32'hBD2F5351, 32'h3F8262A8, 32'h3E9983FC }, // seg 57 + '{ 32'hBDC30600, 32'h3F6CED04, 32'h3E8452B2 }, // seg 58 + '{ 32'hBE223FEB, 32'h3F53072C, 32'h3E5F310C }, // seg 59 + '{ 32'hBE6A52FA, 32'h3F38CE9A, 32'h3E39061F }, // seg 60 + '{ 32'hBE9B0431, 32'h3F1F8DD2, 32'h3E1752AC }, // seg 61 + '{ 32'hBEC1122B, 32'h3F081E0D, 32'h3DF4E633 }, // seg 62 + '{ 32'hBEE60A55, 32'h3EE5F2C5, 32'h3DC485D6 }, // seg 63 + '{ 32'hBF047F8C, 32'h3EC0A114, 32'h3D9CACD1 }, // seg 64 + '{ 32'hBF1BE8C6, 32'h3E924AAB, 32'h3D5D9C90 }, // seg 65 + '{ 32'hBF365584, 32'h3E464E44, 32'h3D097CC9 }, // seg 66 + '{ 32'hBF4ADF8C, 32'h3E045D95, 32'h3CA91767 }, // seg 67 + '{ 32'hBF5A4B6D, 32'h3DAEBB15, 32'h3C4ED5EF }, // seg 68 + '{ 32'hBF6594EE, 32'h3D64B815, 32'h3BFC2E78 }, // seg 69 + '{ 32'hBF6DAF91, 32'h3D14B5D0, 32'h3B996CB5 }, // seg 70 + '{ 
32'hBF736A9F, 32'h3CC05DAA, 32'h3B3A76B3 }, // seg 71 + '{ 32'hBF776B54, 32'h3C77C747, 32'h3AE2796C }, // seg 72 + '{ 32'hBF7B290A, 32'h3C00F553, 32'h3A590596 }, // seg 73 + '{ 32'hBF7DD423, 32'h3B51CDAD, 32'h399FBCDD }, // seg 74 + '{ 32'hBF7F0B0E, 32'h3AA91301, 32'h38EB151C }, // seg 75 + '{ 32'hBF7F95A3, 32'h3A073EF5, 32'h382D041E }, // seg 76 + '{ 32'hBF7FD274, 32'h3956AD85, 32'h377E46D5 }, // seg 77 + '{ 32'hBF7FECAF, 32'h38A9A5DF, 32'h36BB164C }, // seg 78 + '{ 32'hBF7FF7E0, 32'h3805A163, 32'h3609E0DD }, // seg 79 + '{ 32'hBF7FFCA1, 32'h37505C2F, 32'h3549DC3A } // seg 80 +}; + diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v new file mode 100644 index 0000000000..eecf2ac74d --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v @@ -0,0 +1,69 @@ +/****************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Verilog wrapper for pwpolyf IP packaging.
+ */
+
+// Template file: every $NAME$ token below is a placeholder that is replaced
+// textually by the Python code generator (PWPolyF_rtl.prepare_codegen_rtl_values
+// builds the substitution table) before the file is used:
+//   $MODULE_NAME_AXI_WRAPPER$  generated top-level module name
+//   $PE$                       number of parallel elements
+//   $FUNC$                     activation name as a quoted string
+//   $IN_WIDTH$ / $OUT_WIDTH$   stream widths, generated as PE * 32 bits
+// The module only renames ports and inverts the reset; all processing happens
+// in the instantiated `pwpolyf` core.
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter PE = $PE$,
+	parameter FUNC = $FUNC$
+)(
+	// Clock is associated with both AXI streams and the reset for IP packaging.
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input ap_clk,
+	// Active-low reset, as declared by the POLARITY attribute.
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input ap_rst_n,
+
+	//- AXI Stream - Input --------------
+	output in0_V_TREADY,
+	input in0_V_TVALID,
+	input [$IN_WIDTH$-1:0] in0_V_TDATA,
+
+	//- AXI Stream - Output -------------
+	input out_V_TREADY,
+	output out_V_TVALID,
+	output [$OUT_WIDTH$-1:0] out_V_TDATA
+);
+
+	// Core instance: parameters are forwarded unchanged.
+	pwpolyf #(
+		.PE(PE),
+		.FUNC(FUNC)
+	) core (
+		.clk(ap_clk),
+		.rst(!ap_rst_n),	// core takes the inverted (active-high) reset
+		.xdat(in0_V_TDATA),	// input stream  -> xdat/xvld/xrdy
+		.xvld(in0_V_TVALID),
+		.xrdy(in0_V_TREADY),
+		.ydat(out_V_TDATA),	// output stream <- ydat/yvld/yrdy
+		.yvld(out_V_TVALID),
+		.yrdy(out_V_TREADY)
+	);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/pwpolyf/hdl/queue.sv b/finn-rtllib/pwpolyf/hdl/queue.sv
new file mode 100755
index 0000000000..e5c3cf9889
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/queue.sv
@@ -0,0 +1,78 @@
+/****************************************************************************
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @author Thomas B.
Preußer
+ ***************************************************************************/
+
+// Elastic FIFO with valid/ready handshakes on both sides.
+// Storage is a shift array A with ELASTICITY entries plus an output register B.
+//   DATA_WIDTH  width of one queue entry in bits
+//   ELASTICITY  number of entries in A; must be at least 2 (checked below)
+module queue #(
+	int unsigned DATA_WIDTH,
+	int unsigned ELASTICITY
+)(
+	input logic clk,
+	input logic rst,
+
+	// Input side: a word is accepted in a cycle where ivld && irdy.
+	input logic [DATA_WIDTH-1:0] idat,
+	input logic ivld,
+	output logic irdy,
+
+	// Output side: odat is valid while ovld is high; ordy consumes it.
+	output logic [DATA_WIDTH-1:0] odat,
+	output logic ovld,
+	input logic ordy
+);
+
+	typedef logic [DATA_WIDTH-1:0] dat_t;
+	// Elaboration-time parameter check.
+	initial begin
+		if(ELASTICITY < 2) begin
+			$error("%m: ELASTICITY of %0d must be made 2 or above.", ELASTICITY);
+			$finish;
+		end
+	end
+
+	// Ptr selects the entry of A that is loaded into B next. New data enters
+	// at A[0] and older entries shift towards higher indices, so Ptr addresses
+	// the oldest stored entry. A set sign bit (Ptr == -1) means A is empty.
+	logic signed [$clog2(ELASTICITY):0] Ptr = '1; // -1, 0, 1, ..., ELASTICITY-1
+	logic Rdy = 1;
+	dat_t A[ELASTICITY];
+	assign irdy = Rdy;
+
+	// Output register and its valid flag.
+	logic Vld = 0;
+	dat_t B = 'x;
+	assign odat = B;
+	assign ovld = Vld;
+
+	uwire bload = !Vld || ordy;	// B is free or being consumed this cycle
+	uwire push = Rdy && ivld;	// input word accepted this cycle
+	uwire pop = !Ptr[$left(Ptr)] && bload;	// A is non-empty and B can take its oldest entry
+
+	// Shift a newly accepted word in at index 0; A itself is not reset.
+	always_ff @(posedge clk) begin
+		if(push) A <= { idat, A[0:ELASTICITY-2] };
+	end
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			Ptr <= '1;
+			Rdy <= 1;
+			Vld <= 0;
+			B <= 'x;
+		end
+		else begin
+			// Make sure Rdy encodes what it's supposed to: space available in queue
+			assert(Rdy == (Ptr < signed'(ELASTICITY-1))) else begin
+				$error("%m: Broken Rdy computation.");
+				$stop;
+			end
+
+			// Fill level changes only when exactly one of push / pop happens.
+			Ptr <= Ptr + ((push == pop)? 0 : push? 1 : -1);
+			// pop == push: no change
+			// pop && !push: new space
+			// !pop && push: remaining space if not yet Ptr == ELASTICITY-2
+			Rdy <= (pop == push)? Rdy : pop? 1 : Ptr[$left(Ptr)] || (((ELASTICITY-2) & ~Ptr[$left(Ptr)-1:0]) != 0);
+			// Refill the output register; Vld follows whether A held data.
+			if(bload) begin
+				Vld <= !Ptr[$left(Ptr)];
+				B <= A[Ptr[$left(Ptr)-1:0]];
+			end
+		end
+	end
+
+endmodule : queue
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index ecc1d28c53..dad7910e5c 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -350,6 +350,8 @@ def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig):
     model = model.transform(to_hw.InferLabelSelectLayer())
     # input quantization (if any) as standalone threshold
     model = model.transform(to_hw.InferThresholdingLayer())
+    # piecewise polynomial activations (GELU, SiLU, Sigmoid, Tanh)
+    model = model.transform(to_hw.InferPWPolyFLayer())
     # needed for convolutions -- TODO always exec?
     need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0
     if need_conv:
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..c924b538a0 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -42,6 +42,7 @@
 from finn.custom_op.fpgadataflow.lookup import Lookup
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
 from finn.custom_op.fpgadataflow.pool import Pool
+from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
 from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
     StreamingDataflowPartition,
 )
@@ -76,6 +77,7 @@
 custom_op["LabelSelect"] = LabelSelect
 custom_op["Lookup"] = Lookup
 custom_op["Pool"] = Pool
+custom_op["PWPolyF"] = PWPolyF
 custom_op["StreamingConcat"] = StreamingConcat
 custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter
 custom_op["StreamingEltwise"] = StreamingEltwise
diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
new file mode 100644
index 0000000000..e05ba9c2aa
--- /dev/null
+++
b/src/finn/custom_op/fpgadataflow/pwpolyf.py @@ -0,0 +1,187 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# Piecewise polynomial constants matching the RTL module +_NUM_OCTAVES = 5 +_SUPPORTED_FUNCS = {"gelu", "silu", "sigmoid", "tanh"} + + +class PWPolyF(HWCustomOp): + """ + HW op for piecewise polynomial activations (GELU, SiLU, Sigmoid, Tanh). 
+ + Element-wise FP32, coefficients baked into RTL. No weights or BRAM. + """ + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # activation function: gelu, silu, sigmoid, tanh + "func": ("s", True, ""), + # top-mantissa subdivision bits (K=3 gives 81 segments) + "K": ("i", False, 3), + # parallelism; elements processed per cycle + "PE": ("i", True, 0), + # number of channels (last dimension of input tensor) + "NumChannels": ("i", True, 0), + # FINN DataTypes for inputs, outputs (always FLOAT32) + "inputDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_num_segments(self): + K = self.get_nodeattr("K") + return 1 + 2 * _NUM_OCTAVES * (1 << K) + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + func = self.get_nodeattr("func") + if func in _SUPPORTED_FUNCS: + info_messages.append("Attribute func is set correctly") + else: + info_messages.append( + "Attribute func must be one of %s, got %s" 
% (_SUPPORTED_FUNCS, func) + ) + + pe = self.get_nodeattr("PE") + nch = self.get_nodeattr("NumChannels") + if pe > 0 and nch > 0 and nch % pe == 0: + info_messages.append("PE divides NumChannels") + else: + info_messages.append("PE must divide NumChannels evenly") + + idt = self.get_nodeattr("inputDataType") + if idt != "FLOAT32": + info_messages.append("PWPolyF requires FLOAT32 input, got %s" % idt) + + return info_messages + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + return self.get_input_datatype().bitwidth() * self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + return self.get_output_datatype().bitwidth() * self.get_nodeattr("PE") + + def get_folded_input_shape(self, ind=0): + pe = self.get_nodeattr("PE") + nch = self.get_nodeattr("NumChannels") + fold = nch // pe + vecs = list(self.get_nodeattr("numInputVectors")) + return tuple(vecs + [fold, pe]) + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + nch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + return tuple(vecs + [nch]) + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # II=1, latency amortised over stream length + return np.prod(self.get_folded_output_shape()[:-1]) + + def lut_estimation(self): + pe = self.get_nodeattr("PE") + return 200 * pe + + def bram_estimation(self): + # coefficients stored in LUT ROM, not BRAM + return 0 + + def uram_estimation(self): + return 0 + + def dsp_estimation(self, fpgapart=None): + # two DSPFP32 FMA instances 
per PE (Horner evaluation) + pe = self.get_nodeattr("PE") + return 2 * pe + + def execute_node(self, context, graph): + node = self.onnx_node + inp = context[node.input[0]] + + func = self.get_nodeattr("func") + K = self.get_nodeattr("K") + + # lazy import to avoid hard dependency on torch at module level + import torch + from finn.util.pwpolyf import PiecewisePolyActivation + + mod = PiecewisePolyActivation(func, K=K) + with torch.no_grad(): + x = torch.from_numpy(inp.astype(np.float32)) + y = mod(x) + context[node.output[0]] = y.numpy() diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 06067a4fca..6c483ba0d3 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -35,6 +35,7 @@ StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl +from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import PWPolyF_rtl from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl @@ -48,4 +49,5 @@ custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl custom_op["MVAU_rtl"] = MVAU_rtl custom_op["VVAU_rtl"] = VVAU_rtl +custom_op["PWPolyF_rtl"] = PWPolyF_rtl custom_op["Thresholding_rtl"] = Thresholding_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py new file mode 100644 index 0000000000..d4736f7fee --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py @@ -0,0 +1,289 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import os +import struct +from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io + +from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.util.basic import ( + get_rtlsim_trace_depth, + make_build_dir, + pyverilate_get_liveness_threshold_cycles, +) +from finn.util.data_packing import ( + npy_to_rtlsim_input, + rtlsim_output_to_npy, +) +from finn.util.pwpolyf import ( + NUM_OCTAVES, + SUPPORTED_FUNCS, + _fit_coefficients, +) + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +def _float_to_hex(f): + """Convert a Python float to a 32-bit IEEE 754 hex string.""" + return "%08X" % struct.unpack("!I", struct.pack("!f", float(f)))[0] + + +def generate_coeffs_svh(K, num_samples=1000): + """Generate the pwpolyf_coeffs.svh file content for a given K value.""" + num_subs = 1 << K + num_segs = 1 + 2 * NUM_OCTAVES * num_subs + + lines = [] + lines.append("// Auto-generated by pwpolyf_rtl.py — do not edit manually.") + lines.append("// K=%d, NUM_SEGS=%d, NUM_OCTAVES=%d, DEGREE=2" % (K, num_segs, NUM_OCTAVES)) + lines.append("") + lines.append("localparam int unsigned PWPOLYF_K = %d;" % K) + lines.append("localparam int unsigned PWPOLYF_NUM_SEGS = %d;" % num_segs) + lines.append("localparam int unsigned PWPOLYF_NUM_OCTAVES = %d;" % NUM_OCTAVES) + lines.append("localparam int unsigned PWPOLYF_DEGREE = 2;") + + for func_name in SUPPORTED_FUNCS: + coeffs = _fit_coefficients(func_name, K, num_samples) + label = "PWPOLYF_%s_COEFFS" % func_name.upper() + lines.append("") + lines.append("localparam logic [31:0] %s[%d][3] = '{" % (label, num_segs)) + for seg in range(num_segs): + c0 = _float_to_hex(coeffs[seg, 0]) + c1 = _float_to_hex(coeffs[seg, 1]) + c2 = _float_to_hex(coeffs[seg, 2]) + comma = "," if seg < num_segs - 1 else "" + lines.append( + " '{ 32'h%s, 32'h%s, 32'h%s }%s // seg %d" + % (c0, c1, c2, comma, seg) + ) + 
class PWPolyF_rtl(PWPolyF, RTLBackend):
    """RTL backend of the PWPolyF layer, built from the finn-rtllib pwpolyf sources."""

    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)

    def get_nodeattr_types(self):
        """Return the PWPolyF node attributes merged with those of RTLBackend."""
        merged = {}
        # RTLBackend entries are applied last, so they win on a key clash
        for base in (PWPolyF, RTLBackend):
            merged.update(base.get_nodeattr_types(self))
        return merged

    def prepare_codegen_rtl_values(self, model):
        """Return the placeholder -> replacement-lines mapping for the RTL templates."""
        num_pe = self.get_nodeattr("PE")
        func_name = self.get_nodeattr("func")

        # both keys deliberately share the same one-element list
        wrapper_name = [self.get_verilog_top_module_name() + "_axi_wrapper"]
        return {
            "$MODULE_NAME_AXI_WRAPPER$": wrapper_name,
            "$TOP_MODULE$": wrapper_name,
            "$PE$": [str(num_pe)],
            "$FUNC$": ['"%s"' % func_name],
            "$IN_WIDTH$": [str(num_pe * 32)],
            "$OUT_WIDTH$": [str(num_pe * 32)],
        }

    def get_rtl_file_list(self):
        """Return the file names of the pwpolyf RTL sources."""
        return [
            "pwpolyf.sv",
            "pwpolyf_coeffs.svh",
            "queue.sv",
            "pwpolyf_template_wrapper.v",
        ]

    def get_rtl_file_paths(self):
        """Return the source files prefixed with the pwpolyf HDL directory under FINN_ROOT."""
        hdl_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/pwpolyf/hdl/"
        return [hdl_dir + name for name in self.get_rtl_file_list()]

    def get_rtl_template_data(self, path):
        """Return the text content of the file at *path*."""
        with open(path, "r") as src:
            return src.read()

    def fill_in_rtl_template_data(self, replace_dict, template_data):
        """Return *template_data* with each key of *replace_dict* replaced by its joined lines."""
        filled = template_data
        for placeholder, repl_lines in replace_dict.items():
            filled = filled.replace(placeholder, "\n".join(repl_lines))
        return filled

    def dump_rtl_data(self, dest_dir, filename, data):
        """Write *data* into *dest_dir*; template files are stored as <gen_top_module>.v."""
        out_name = filename
        if "template" in out_name:
            out_name = self.get_nodeattr("gen_top_module") + ".v"
        with open(os.path.join(dest_dir, out_name), "w") as dst:
            dst.write(data)

    def generate_hdl(self, model, fpgapart, clk):
        """Emit the filled-in RTL sources and a generated coefficient header into
        code_gen_dir_ipgen, then record that directory as ipgen_path and ip_path."""
        subst = self.prepare_codegen_rtl_values(model)
        out_dir = self.get_nodeattr("code_gen_dir_ipgen")
        self.set_nodeattr("gen_top_module", subst["$TOP_MODULE$"][0])

        for src_path in self.get_rtl_file_paths():
            raw = self.get_rtl_template_data(src_path)
            filled = self.fill_in_rtl_template_data(subst, raw)
            self.dump_rtl_data(out_dir, src_path.split("/")[-1], filled)

        # the copied coefficient header is overwritten with one generated for
        # this node's K attribute
        svh_text = generate_coeffs_svh(self.get_nodeattr("K"))
        with open(os.path.join(out_dir, "pwpolyf_coeffs.svh"), "w") as dst:
            dst.write(svh_text)

        self.set_nodeattr("ipgen_path", out_dir)
        self.set_nodeattr("ip_path", out_dir)

    def prepare_rtlsim(self):
        """Build the PyVerilator simulation of the generated sources, store the path
        of its shared library in rtlsim_so and return the simulation object."""
        if PyVerilator is None:
            raise ImportError("Installation of PyVerilator is required.")

        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        top_name = self.get_nodeattr("gen_top_module")

        # the .svh header is not passed as a source file; it is reached through
        # the include search path (verilog_path) instead
        sources = []
        for fname in self.get_rtl_file_list():
            if fname.endswith(".svh"):
                continue
            sources.append(fname.replace("pwpolyf_template_wrapper", top_name))
        build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")

        sim = PyVerilator.build(
            sources,
            build_dir=build_dir,
            verilog_path=[code_gen_dir],
            trace_depth=get_rtlsim_trace_depth(),
            top_module_name=top_name,
            auto_eval=False,
        )

        self.set_nodeattr("rtlsim_so", sim.lib._name)
        return sim

    def execute_node(self, context, graph):
        """Execute the node in the mode selected by exec_mode.

        "cppsim" delegates to PWPolyF.execute_node; "rtlsim" drives the RTL
        simulation and writes the result into *context*. Any other mode raises
        an Exception.
        """
        mode = self.get_nodeattr("exec_mode")
        if mode == "cppsim":
            PWPolyF.execute_node(self, context, graph)
            return
        if mode != "rtlsim":
            raise Exception(
                "Invalid value for attribute exec_mode! Is currently set to: %s "
                "has to be one of ('cppsim', 'rtlsim')" % mode
            )

        node = self.onnx_node
        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        in_npy_path = os.path.join(code_gen_dir, "input_0.npy")

        # store the folded input on disk for the rtlsim packing helper
        folded_in_shape = self.get_folded_input_shape()
        folded_in = context[node.input[0]].reshape(folded_in_shape)
        in_dtype = self.get_input_datatype()
        folded_in = folded_in.copy()
        np.save(in_npy_path, folded_in)

        sim = self.get_rtlsim()
        in_width = self.get_instream_width()
        packed_in = npy_to_rtlsim_input(in_npy_path, in_dtype, in_width)
        intf_names = self.get_verilog_top_module_intf_names()
        in_stream = intf_names["s_axis"][0][0]
        out_stream = intf_names["m_axis"][0][0]
        io_dict = {
            "inputs": {in_stream: packed_in},
            "outputs": {out_stream: []},
        }

        trace_file = self.get_nodeattr("rtlsim_trace")
        if trace_file == "default":
            trace_file = self.onnx_node.name + ".vcd"

        expected_outputs = self.get_number_output_values()
        reset_rtlsim(sim)
        cycles = rtlsim_multi_io(
            sim,
            io_dict,
            expected_outputs,
            trace_file=trace_file,
            sname="_",
            liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
        )
        self.set_nodeattr("cycles_rtlsim", cycles)
        packed_out = io_dict["outputs"][out_stream]

        # unpack the simulated stream through an .npy file, then restore the
        # unfolded output shape
        out_dtype = self.get_output_datatype()
        out_npy_path = os.path.join(code_gen_dir, "output.npy")
        rtlsim_output_to_npy(
            packed_out,
            out_npy_path,
            out_dtype,
            self.get_folded_output_shape(),
            self.get_outstream_width(),
            out_dtype.bitwidth(),
        )

        result = np.load(out_npy_path)
        normal_shape = self.get_normal_output_shape()
        result = np.asarray([result], dtype=np.float32).reshape(*normal_shape)
        context[node.output[0]] = result

    def code_generation_ipi(self):
        """Return the Tcl commands that add the generated sources and instantiate
        the top module as a block-design cell named after this node."""
        top_name = self.get_nodeattr("gen_top_module")
        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
        target_dir = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name

        cmds = ["file mkdir %s" % target_dir]
        for fname in self.get_rtl_file_list():
            gen_name = fname.replace("pwpolyf_template_wrapper", top_name)
            cmds.append(
                "add_files -copy_to %s -norecurse %s"
                % (target_dir, os.path.join(code_gen_dir, gen_name))
            )
        cmds.append(
            "create_bd_cell -type module -reference %s %s"
            % (top_name, self.onnx_node.name)
        )
        return cmds
NumChannels=num_channels, + PE=1, + inputDataType=idt.name, + outputDataType=idt.name, + numInputVectors=list(pwp_in_shape[:-1]), + name="PWPolyF_" + node.name, + ) + + graph.node.insert(node_ind, new_node) + graph.node.remove(node) + graph_modified = True + + return (model, graph_modified) + + class InferUpsample(Transformation): """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes.""" diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index eaee499e6a..07b2e89f19 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -104,6 +104,7 @@ def apply(self, model): "ChannelwiseOp_hls", "DuplicateStreams_hls", "GlobalAccPool_hls", + "PWPolyF_rtl", "Thresholding_hls", "Thresholding_rtl", ] diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py new file mode 100644 index 0000000000..484cfde85c --- /dev/null +++ b/src/finn/util/pwpolyf.py @@ -0,0 +1,236 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
Piecewise polynomial activation - PyTorch module and software model.

Drop-in activation that approximates GELU, SiLU, Sigmoid, and Tanh using
degree-2 polynomials, matching the pwpolyf RTL behaviour. Emits a single
PWPolyF custom op node during ONNX export (requires dynamo=False).
"""

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Constants matching the SystemVerilog module
NUM_OCTAVES = 5
EXP_BIAS = 127
EXP_BASE = 125
EXP_CLAMP = 130

SUPPORTED_FUNCS = ("gelu", "silu", "sigmoid", "tanh")

REFERENCE_FUNCS = {
    "gelu": lambda x: F.gelu(x),
    "silu": lambda x: F.silu(x),
    "sigmoid": lambda x: torch.sigmoid(x),
    "tanh": lambda x: torch.tanh(x),
}

# Output used once |x| >= 8: a constant per side, or the input itself on the
# positive side when pos_passthrough is set.
CLAMP_CFG = {
    "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
    "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
    "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False},
    "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False},
}


def _segment_boundaries(K):
    """Return the (lo, hi) bounds of every segment.

    Order: the near-zero segment, then the positive segments octave by octave
    (2**K sub-segments each), then their negative mirrors in the same order.
    """
    num_subs = 1 << K

    positive = []
    for octave in range(NUM_OCTAVES):
        base = 2.0 ** (EXP_BASE + octave - EXP_BIAS)
        for sub in range(num_subs):
            lo = base * (1.0 + sub / num_subs)
            hi = base * (1.0 + (sub + 1) / num_subs)
            positive.append((lo, hi))

    negative = [(-hi, -lo) for lo, hi in positive]
    return [(-0.25, 0.25)] + positive + negative


def _fit_coefficients(func_name, K, num_samples=1000):
    """Fit a degree-2 polynomial to the reference function on every segment.

    Returns a float32 tensor of shape (num_segments, 3) holding the
    coefficients (a0, a1, a2) in ascending order of power.
    """
    ref_fn = REFERENCE_FUNCS[func_name]
    bounds = _segment_boundaries(K)
    coeffs = np.zeros((len(bounds), 3), dtype=np.float64)

    for seg, (lo, hi) in enumerate(bounds):
        xs = np.linspace(lo, hi, num_samples, dtype=np.float64)
        with torch.no_grad():
            # the reference is sampled in float32; only the fit runs in float64
            ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64)
        coeffs[seg] = np.polynomial.polynomial.polyfit(xs, ys, deg=2)[:3]

    return torch.from_numpy(coeffs.astype(np.float32))


def _segment_index(x, K, num_subs, num_segs):
    """Map each element to its polynomial segment, mirroring SV addressing.

    Returns (seg_idx, is_neg_clamp, is_pos_clamp). The octave and sub-segment
    are taken from the exact binary exponent and mantissa of |x|, so values
    just below a power of two cannot slip into the next octave the way a
    rounded floating-point log2 can. *K* is accepted for interface
    compatibility; the subdivision count is taken from *num_subs*.
    """
    abs_x = x.abs()
    is_neg = x < 0

    is_near_zero = abs_x < 0.25
    is_clamp = abs_x >= 8.0
    is_neg_clamp = is_neg & is_clamp
    is_pos_clamp = (~is_neg) & is_clamp

    safe_abs = abs_x.clamp(min=0.25)
    if safe_abs.dtype not in (torch.float32, torch.float64):
        # widening a reduced-precision float is exact and keeps frexp on a
        # dtype it is certain to support
        safe_abs = safe_abs.float()

    # safe_abs == mant * 2**exp with mant in [0.5, 1), hence
    # floor(log2(safe_abs)) == exp - 1 and safe_abs / 2**(exp - 1) == 2 * mant
    mant, exp = torch.frexp(safe_abs)
    octave = (exp.long() + 1).clamp(0, NUM_OCTAVES - 1)
    frac = mant * 2.0 - 1.0
    sub = (frac * num_subs).long().clamp(0, num_subs - 1)

    pos_idx = 1 + octave * num_subs + sub
    neg_idx = 1 + NUM_OCTAVES * num_subs + octave * num_subs + sub

    seg_idx = torch.where(
        is_near_zero,
        torch.zeros_like(pos_idx),
        torch.where(is_neg, neg_idx, pos_idx),
    )
    seg_idx = seg_idx.clamp(0, num_segs - 1)

    return seg_idx, is_neg_clamp, is_pos_clamp


def _evaluate(x, coeffs, neg_clamp_val, pos_clamp_val, K, pos_passthrough):
    """Evaluate the piecewise polynomial on *x* and apply the clamp regions.

    Shared by PWPolyFFunction.forward and PiecewisePolyActivation.forward so
    both paths produce identical results. The output has the shape of *x*.
    """
    num_subs = 1 << K
    num_segs = 1 + 2 * NUM_OCTAVES * num_subs

    orig_shape = x.shape
    x_flat = x.contiguous().view(-1)

    seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs)

    c = coeffs[seg_idx]
    a0 = c[:, 0]
    a1 = c[:, 1]
    a2 = c[:, 2]

    # Horner: y = a0 + x*(a1 + a2*x)
    y = a0 + x_flat * (a1 + a2 * x_flat)

    pos_val = x_flat if pos_passthrough else pos_clamp_val.expand_as(y)
    y = torch.where(is_pos_clamp, pos_val, y)
    y = torch.where(is_neg_clamp, neg_clamp_val.expand_as(y), y)

    return y.view(orig_shape)


class PWPolyFFunction(torch.autograd.Function):
    """Emits a single PWPolyF ONNX node during export."""

    @staticmethod
    def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
        pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
        return _evaluate(x, coeffs, neg_clamp_val, pos_clamp_val, K, pos_passthrough)

    @staticmethod
    def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
        # only the input tensor plus the func/K attributes reach the ONNX node
        return g.op("PWPolyF", x, func_s=func, K_i=K)


class PiecewisePolyActivation(nn.Module):
    """
    Drop-in activation matching the pwpolyf hardware behaviour.

    Approximates nonlinear activations using degree-2 polynomials over
    segments defined by FP32 bit-extraction. Evaluated via Horner's method.
    Emits a single PWPolyF custom op node during ONNX export.

    Raises ValueError when *func* is not one of SUPPORTED_FUNCS.
    """

    def __init__(self, func="gelu", K=3, fit_samples=1000):
        super().__init__()
        if func not in SUPPORTED_FUNCS:
            raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS))

        self.func = func
        self.K = K
        self.num_subs = 1 << K
        self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs
        self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]

        coeffs = _fit_coefficients(func, K, fit_samples)
        self.register_buffer("coeffs", coeffs)

        neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32)
        pos_cv = torch.tensor(CLAMP_CFG[func]["pos_clamp"], dtype=torch.float32)
        self.register_buffer("neg_clamp_val", neg_cv)
        self.register_buffer("pos_clamp_val", pos_cv)

    def forward(self, x):
        if torch.onnx.is_in_onnx_export():
            # route through the autograd.Function so its symbolic() is used
            return PWPolyFFunction.apply(
                x, self.coeffs, self.neg_clamp_val, self.pos_clamp_val,
                self.func, self.K,
            )

        return _evaluate(
            x, self.coeffs, self.neg_clamp_val, self.pos_clamp_val,
            self.K, self.pos_passthrough,
        )
import pytest

import numpy as np
import onnx
import os
import tempfile
import torch
from onnx import TensorProto, helper
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp
from qonnx.transformation.general import GiveUniqueNodeNames
from qonnx.transformation.infer_datatypes import InferDataTypes
from qonnx.transformation.infer_shapes import InferShapes

import finn.core.onnx_exec as oxe
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
from finn.util.pwpolyf import PiecewisePolyActivation

test_fpga_part = "xczu3eg-sbva484-1-e"


def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs):
    """Build a ModelWrapper holding one PWPolyF HW node (PE=1, FLOAT32 in/out)."""
    inp = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, num_input_vecs + [num_channels]
    )
    outp = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, num_input_vecs + [num_channels]
    )

    pwpolyf_node = helper.make_node(
        "PWPolyF",
        ["inp"],
        ["outp"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        func=func,
        K=K,
        NumChannels=num_channels,
        PE=1,
        inputDataType="FLOAT32",
        outputDataType="FLOAT32",
        numInputVectors=num_input_vecs,
        name="PWPolyF_0",
    )

    graph = helper.make_graph(
        nodes=[pwpolyf_node],
        name="pwpolyf_graph",
        inputs=[inp],
        outputs=[outp],
    )
    model = helper.make_model(graph, producer_name="pwpolyf-test")
    model = ModelWrapper(model)
    model = model.transform(InferShapes())
    model = model.transform(InferDataTypes())
    model = model.transform(GiveUniqueNodeNames())
    return model


def _export_activation(mod, dummy, input_name, output_name):
    """Export *mod* with dynamo=False at opset 13 and return the loaded ModelProto.

    The file lives in a temporary directory that is removed on return, also
    when the export itself fails.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        onnx_path = os.path.join(tmp_dir, "pwpolyf.onnx")
        torch.onnx.export(
            mod, dummy, onnx_path,
            input_names=[input_name], output_names=[output_name],
            opset_version=13, dynamo=False,
        )
        return onnx.load(onnx_path)


def _reference_output(func, K, x):
    """Return the PiecewisePolyActivation output for the numpy array *x*."""
    ref_mod = PiecewisePolyActivation(func, K=K)
    with torch.no_grad():
        return ref_mod(torch.from_numpy(x)).numpy()


@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"])
@pytest.mark.parametrize("num_channels", [4, 16])
@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]])
@pytest.mark.parametrize("fold", [-1, 1, 2])
@pytest.mark.fpgadataflow
def test_pwpolyf_cppsim(func, num_channels, num_input_vecs, fold):
    """Check node execution against the PyTorch reference for several foldings."""
    K = 3
    # fold == -1 selects full folding, i.e. PE = 1
    if fold == -1:
        fold = num_channels
    pe = num_channels // fold
    if num_channels % pe != 0:
        pytest.skip("Invalid folding configuration.")

    model = make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs)
    inst = getCustomOp(model.graph.node[0])
    inst.set_nodeattr("PE", pe)

    input_shape = tuple(num_input_vecs + [num_channels])
    # the range extends past |x| = 8 so the clamp regions are exercised too
    x = np.random.uniform(-10, 10, input_shape).astype(np.float32)

    y_expected = _reference_output(func, K, x)
    y_produced = oxe.execute_onnx(model, {"inp": x})["outp"]

    assert y_produced.shape == y_expected.shape
    assert np.allclose(y_produced, y_expected, atol=1e-6)


@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"])
@pytest.mark.fpgadataflow
def test_pwpolyf_onnx_export(func):
    """The module must export as exactly one PWPolyF node carrying func and K."""
    K = 3
    num_channels = 32
    mod = PiecewisePolyActivation(func, K=K)
    mod.eval()
    dummy = torch.randn(1, num_channels)

    onnx_model = _export_activation(mod, dummy, "input", "output")

    pwp_nodes = [n for n in onnx_model.graph.node if n.op_type == "PWPolyF"]
    assert len(pwp_nodes) == 1
    attrs = {a.name: a for a in pwp_nodes[0].attribute}
    assert attrs["func"].s.decode("utf-8") == func
    assert attrs["K"].i == K


@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
@pytest.mark.fpgadataflow
def test_pwpolyf_infer_transform(func):
    """Export, convert with InferPWPolyFLayer, then execute and compare."""
    K = 3
    num_channels = 16
    mod = PiecewisePolyActivation(func, K=K)
    mod.eval()
    dummy = torch.randn(1, num_channels)

    model = ModelWrapper(_export_activation(mod, dummy, "inp", "outp"))

    node = model.graph.node[0]
    assert node.op_type == "PWPolyF"
    assert node.domain != "finn.custom_op.fpgadataflow"

    model = model.transform(InferPWPolyFLayer())

    node = model.graph.node[0]
    assert node.op_type == "PWPolyF"
    assert node.domain == "finn.custom_op.fpgadataflow"

    inst = getCustomOp(node)
    assert inst.get_nodeattr("func") == func
    assert inst.get_nodeattr("K") == K
    assert inst.get_nodeattr("NumChannels") == num_channels
    assert inst.get_nodeattr("PE") == 1
    assert inst.get_nodeattr("inputDataType") == "FLOAT32"

    x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32)
    y_produced = oxe.execute_onnx(model, {"inp": x})["outp"]
    y_expected = _reference_output(func, K, x)
    assert np.allclose(y_produced, y_expected, atol=1e-6)


@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"])
@pytest.mark.fpgadataflow
def test_pwpolyf_specialize_rtl(func):
    """SpecializeLayers must turn the HW node into PWPolyF_rtl and keep func/K."""
    K = 3
    num_channels = 8
    model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])
    model = model.transform(SpecializeLayers(test_fpga_part))

    node = model.graph.node[0]
    assert node.op_type == "PWPolyF_rtl"
    assert node.domain == "finn.custom_op.fpgadataflow.rtl"

    inst = getCustomOp(node)
    assert inst.get_nodeattr("func") == func
    assert inst.get_nodeattr("K") == K


@pytest.mark.parametrize("func", ["gelu", "tanh"])
@pytest.mark.parametrize("pe", [1, 2, 4])
@pytest.mark.fpgadataflow
def test_pwpolyf_resource_estimates(func, pe):
    """Resource estimates must scale with PE and report no BRAM/URAM."""
    K = 3
    num_channels = 8
    model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])
    inst = getCustomOp(model.graph.node[0])
    inst.set_nodeattr("PE", pe)

    assert inst.dsp_estimation() == 2 * pe
    assert inst.lut_estimation() == 200 * pe
    assert inst.bram_estimation() == 0
    assert inst.uram_estimation() == 0


@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
@pytest.mark.fpgadataflow
def test_pwpolyf_folded_shape(func):
    """Check normal/folded shapes and stream widths for PE=1 and PE=4."""
    K = 3
    num_channels = 12
    num_input_vecs = [1, 3, 3]
    model = make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs)
    inst = getCustomOp(model.graph.node[0])

    # PE=1
    assert inst.get_normal_input_shape() == (1, 3, 3, 12)
    assert inst.get_normal_output_shape() == (1, 3, 3, 12)
    assert inst.get_folded_input_shape() == (1, 3, 3, 12, 1)
    assert inst.get_folded_output_shape() == (1, 3, 3, 12, 1)

    # PE=4
    inst.set_nodeattr("PE", 4)
    assert inst.get_folded_input_shape() == (1, 3, 3, 3, 4)
    assert inst.get_folded_output_shape() == (1, 3, 3, 3, 4)
    assert inst.get_instream_width() == 4 * 32
    assert inst.get_outstream_width() == 4 * 32


@pytest.mark.parametrize("func", ["gelu", "silu"])
@pytest.mark.fpgadataflow
def test_pwpolyf_exp_cycles(func):
    """Verify expected cycle count estimation."""
    K = 3
    num_channels = 8
    pe = 2
    num_input_vecs = [1, 4, 4]
    model = make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs)
    inst = getCustomOp(model.graph.node[0])
    inst.set_nodeattr("PE", pe)

    # folded shape = (1, 4, 4, 4, 2), exp_cycles = prod of all but last = 1*4*4*4 = 64
    exp = inst.get_exp_cycles()
    assert exp == 1 * 4 * 4 * (num_channels // pe)

    # exp_cycles_per_layer analysis only runs on specialized (rtl/hls) nodes
    model = model.transform(SpecializeLayers(test_fpga_part))
    node = model.graph.node[0]
    inst = getCustomOp(node)
    inst.set_nodeattr("PE", pe)
    exp_dict = model.analysis(exp_cycles_per_layer)
    assert node.name in exp_dict
    assert exp_dict[node.name] == exp
dynamo=True --- docs/finn/pwpolyf.md | 79 ++++-- .../fpgadataflow/convert_to_hw_layers.py | 182 ++++++++++-- .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 267 ++++++++++++++++++ 3 files changed, 491 insertions(+), 37 deletions(-) diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md index c155470bae..a2d6544e04 100644 --- a/docs/finn/pwpolyf.md +++ b/docs/finn/pwpolyf.md @@ -13,7 +13,7 @@ K=3 this gives 81 segments. Segment selection reuses the FP32 exponent/mantissa bit-fields directly, matching the RTL implementation. Polynomial coefficients are generated at HDL build time by -`generate_coeffs_svh()` in `pwpolyf_sim.py`, which fits degree-2 polynomials +`generate_coeffs_svh()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials to the reference PyTorch functions and writes the `pwpolyf_coeffs.svh` header. This ensures the RTL coefficients always match the configured K value. @@ -22,24 +22,53 @@ This ensures the RTL coefficients always match the configured K value. ## Architecture -PWPolyF is **RTL-only** (no HLS variant). The pipeline is: +PWPolyF is **RTL-only** (no HLS variant). Two export paths are supported: ``` -PiecewisePolyActivation (PyTorch) - | torch.onnx.export (dynamo=False) - v -PWPolyF ONNX node - | InferPWPolyFLayer - v -PWPolyF HW op (finn.custom_op.fpgadataflow) - | SpecializeLayers - v -PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl) - | generate_hdl - v -finn-rtllib/pwpolyf/hdl/ SystemVerilog IP +Path A: PiecewisePolyActivation Path B: nn.GELU / nn.SiLU / etc. 
+ | torch.onnx.export | torch.onnx.export + | (dynamo=False) | (dynamo=True or False) + v v +PWPolyF custom ONNX node Standard ONNX ops (Gelu, Sigmoid, + | Tanh, Sigmoid+Mul for SiLU, + | Div+Erf+Add+Mul+Mul for GELU) + | | + +------------- both paths -------------+ + | + InferPWPolyFLayer + v + PWPolyF HW op (finn.custom_op.fpgadataflow) + | SpecializeLayers + v + PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl) + | generate_hdl + v + finn-rtllib/pwpolyf/hdl/ SystemVerilog IP ``` +### Standard ONNX op inference + +`InferPWPolyFLayer` recognises standard ONNX activation ops in addition to +the explicit `PWPolyF` custom op. This allows models that use `nn.GELU`, +`nn.SiLU`, `nn.Sigmoid`, or `nn.Tanh` to be exported with `dynamo=True` +(or `dynamo=False`) and automatically converted to PWPolyF HW layers. + +| ONNX op type | Pattern | Maps to | +|---|---|---| +| `Gelu` (opset 20+) | Single node | `func="gelu"` | +| `Div`+`Erf`+`Add`+`Mul`+`Mul` | `x * 0.5 * (1 + erf(x / sqrt(2)))` | `func="gelu"` | +| `Sigmoid` | Single node (standalone) | `func="sigmoid"` | +| `Tanh` | Single node | `func="tanh"` | +| `Sigmoid` + `Mul` | `Mul(x, Sigmoid(x))` | `func="silu"` | + +Notes: +- `Gelu` as a single ONNX node requires opset 20 or later. With lower + opsets (including `dynamo=True` which defaults to opset 18), GELU + decomposes into a 5-node Erf-based pattern. Both forms are matched. +- SiLU (`nn.SiLU`) has no standard ONNX op; it decomposes to + `Sigmoid(x) * x`. The transformation detects this two-node pattern. +- Only FLOAT32 inputs are converted. Quantised activations are skipped. + ## Folding PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold. @@ -59,11 +88,17 @@ Each PE instantiates its own polynomial evaluation pipeline (2 DSPs). ## ONNX export -`PiecewisePolyActivation` exports as a single `PWPolyF` custom op via -`torch.autograd.Function.symbolic()`. Requires the legacy TorchScript exporter -(`dynamo=False` in `torch.onnx.export`). 
+Two export paths are supported: + +1. **`PiecewisePolyActivation` (explicit)** — exports as a single `PWPolyF` + custom op via `torch.autograd.Function.symbolic()`. Requires + `dynamo=False`. Preserves the `K` attribute on the ONNX node. + +2. **Standard nn modules** (`nn.GELU`, `nn.SiLU`, `nn.Sigmoid`, `nn.Tanh`) — + export with `dynamo=True` or `dynamo=False`. Produces standard ONNX ops + that `InferPWPolyFLayer` converts to PWPolyF with default `K=3`. -Attributes on the ONNX node: +Attributes on the explicit PWPolyF ONNX node: - `func` (string): one of `gelu`, `silu`, `sigmoid`, `tanh` - `K` (int): mantissa subdivision bits (default 3) @@ -112,11 +147,15 @@ Attributes on the ONNX node: ## Tests -`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py` — 68 parametrized tests: +`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py`: - **cppsim**: all 4 functions x 2 channel counts x 2 spatial shapes x 3 foldings - **ONNX export**: verifies single-node export for all functions - **InferPWPolyFLayer**: end-to-end export → transform → execute +- **Standard op inference**: Gelu/Sigmoid/Tanh single-node + SiLU pattern +- **Erf-based GELU inference**: 5-node Erf decomposition pattern matching + execution +- **SiLU edge cases**: reversed Mul input order, multi-consumer Sigmoid +- **Execution correctness**: standard ops produce same output as PiecewisePolyActivation - **SpecializeLayers**: verifies RTL specialization - **Resource estimates**: DSP/LUT/BRAM checks across PE values - **Folded shapes**: input/output/stream width calculations diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index c7e95b28bf..2427a4514a 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -275,17 +275,99 @@ def apply(self, model): class InferPWPolyFLayer(Transformation): - """Convert PWPolyF nodes into piecewise polynomial 
class InferPWPolyFLayer(Transformation):
    """Convert PWPolyF custom ops and standard ONNX activations (Gelu, Sigmoid,
    Tanh, SiLU pattern, Erf-based GELU pattern) into piecewise polynomial HW
    layers.

    The standard-op cases are only converted when the activation input is
    FLOAT32 and has a known shape; they always use the default K=3.
    """

    # standard ONNX op types that map one-to-one onto a PWPolyF function
    _SINGLE_OP_MAP = {"Gelu": "gelu", "Tanh": "tanh"}

    def __init__(self):
        super().__init__()

    @staticmethod
    def _is_const_scalar(model, tensor_name, value, tol=1e-3):
        """Check if *tensor_name* is a constant initializer equal to *value*."""
        init = model.get_initializer(tensor_name)
        if init is None:
            return False
        return init.size == 1 and abs(float(init.flat[0]) - value) < tol

    def _match_erf_gelu(self, model, erf_node):
        """Try to match the Erf-based GELU decomposition rooted at *erf_node*.

        Pattern (opset < 20):
            Div(x, sqrt(2)) → Erf → Add(_, 1) → Mul(0.5, _) → Mul(x, _)

        Returns (pwp_input, pwp_output, nodes_to_remove) on success, else None.
        """
        # --- backward: Erf input must come from Div(x, sqrt(2)) ---
        div_node = model.find_producer(erf_node.input[0])
        if div_node is None or div_node.op_type != "Div":
            return None
        # Division is not commutative: only x / sqrt(2) is GELU, so sqrt(2)
        # (≈ 1.4142) must be the divisor, never the dividend.
        if not self._is_const_scalar(model, div_node.input[1], 1.4142135):
            return None
        gelu_input = div_node.input[0]
        # The Div is removed with the pattern, so its result must feed nothing
        # but the Erf; otherwise another consumer would lose its producer.
        if len(model.find_consumers(div_node.output[0])) != 1:
            return None

        # --- forward: Erf → Add(_, 1) ---
        erf_consumers = model.find_consumers(erf_node.output[0])
        if len(erf_consumers) != 1 or erf_consumers[0].op_type != "Add":
            return None
        add_node = erf_consumers[0]
        other_add = [i for i in add_node.input if i != erf_node.output[0]]
        if len(other_add) != 1 or not self._is_const_scalar(model, other_add[0], 1.0):
            return None

        # --- Add → Mul(0.5, _) ---
        add_consumers = model.find_consumers(add_node.output[0])
        if len(add_consumers) != 1 or add_consumers[0].op_type != "Mul":
            return None
        mul_half = add_consumers[0]
        other_mul_half = [i for i in mul_half.input if i != add_node.output[0]]
        if len(other_mul_half) != 1 or not self._is_const_scalar(
            model, other_mul_half[0], 0.5
        ):
            return None

        # --- Mul(0.5,_) → Mul(x, _) ---
        half_consumers = model.find_consumers(mul_half.output[0])
        if len(half_consumers) != 1 or half_consumers[0].op_type != "Mul":
            return None
        mul_x = half_consumers[0]
        other_mul_x = [i for i in mul_x.input if i != mul_half.output[0]]
        # the final Mul must multiply by the very tensor that entered the Div
        if len(other_mul_x) != 1 or other_mul_x[0] != gelu_input:
            return None

        nodes_to_remove = [div_node, erf_node, add_node, mul_half, mul_x]
        return (gelu_input, mul_x.output[0], nodes_to_remove)

    @staticmethod
    def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3):
        """Create the PWPolyF HW node (PE=1) for *func*.

        The last dimension of *in_shape* becomes NumChannels, the leading
        dimensions become numInputVectors; *idt* is used for both the input and
        the output datatype.
        """
        num_channels = in_shape[-1]
        return helper.make_node(
            "PWPolyF",
            [pwp_input],
            [pwp_output],
            domain="finn.custom_op.fpgadataflow",
            backend="fpgadataflow",
            func=func,
            K=K,
            NumChannels=num_channels,
            PE=1,
            inputDataType=idt.name,
            outputDataType=idt.name,
            numInputVectors=list(in_shape[:-1]),
            name=name,
        )

    def apply(self, model):
        """Replace every matched activation by a PWPolyF HW node.

        Returns (model, graph_modified).
        """
        graph = model.graph
        node_ind = 0
        graph_modified = False
        for node in graph.node:
            node_ind += 1

            # Case 1: PWPolyF custom op (dynamo=False export path)
            if node.op_type == "PWPolyF" and node.domain != "finn.custom_op.fpgadataflow":
                pwp_input = node.input[0]
                pwp_output = node.output[0]
                pwp_in_shape = model.get_tensor_shape(pwp_input)
                idt = model.get_tensor_datatype(pwp_input)

                func = get_by_name(node.attribute, "func").s.decode("utf-8")
                K_attr = get_by_name(node.attribute, "K")
                K = K_attr.i if K_attr is not None else 3

                new_node = self._make_pwpolyf_node(
                    pwp_input, pwp_output, func, pwp_in_shape, idt,
                    "PWPolyF_" + node.name, K,
                )
                graph.node.insert(node_ind, new_node)
                graph.node.remove(node)
                graph_modified = True

            # Case 2: single-node standard ONNX activations (Gelu, Tanh)
            elif node.op_type in self._SINGLE_OP_MAP:
                pwp_input = node.input[0]
                pwp_output = node.output[0]
                pwp_in_shape = model.get_tensor_shape(pwp_input)
                if pwp_in_shape is None or len(pwp_in_shape) < 1:
                    continue
                idt = model.get_tensor_datatype(pwp_input)
                if idt != DataType["FLOAT32"]:
                    continue

                func = self._SINGLE_OP_MAP[node.op_type]
                new_node = self._make_pwpolyf_node(
                    pwp_input, pwp_output, func, pwp_in_shape, idt,
                    "PWPolyF_" + node.name,
                )
                graph.node.insert(node_ind, new_node)
                graph.node.remove(node)
                graph_modified = True

            # Case 3: Sigmoid — standalone or part of SiLU pattern
            elif node.op_type == "Sigmoid":
                sig_input = node.input[0]
                sig_output = node.output[0]
                pwp_in_shape = model.get_tensor_shape(sig_input)
                if pwp_in_shape is None or len(pwp_in_shape) < 1:
                    continue
                idt = model.get_tensor_datatype(sig_input)
                if idt != DataType["FLOAT32"]:
                    continue

                nodes_to_remove = [node]
                func = "sigmoid"
                pwp_output = sig_output

                # Probe for SiLU: Sigmoid feeds a Mul whose other input
                # is the same tensor x that enters the Sigmoid. A Sigmoid with
                # further consumers stays a plain sigmoid.
                sig_consumers = model.find_consumers(sig_output)
                if len(sig_consumers) == 1:
                    mul_cand = sig_consumers[0]
                    if mul_cand.op_type == "Mul":
                        mul_inputs = list(mul_cand.input)
                        other_idx = 1 if mul_inputs[0] == sig_output else 0
                        if mul_inputs[other_idx] == sig_input:
                            func = "silu"
                            pwp_output = mul_cand.output[0]
                            nodes_to_remove.append(mul_cand)

                new_node = self._make_pwpolyf_node(
                    sig_input, pwp_output, func, pwp_in_shape, idt,
                    "PWPolyF_" + node.name,
                )
                graph.node.insert(node_ind, new_node)
                for nd in nodes_to_remove:
                    graph.node.remove(nd)
                graph_modified = True

            # Case 4: Erf-based GELU (dynamo=True / opset < 20)
            # Div(x, sqrt(2)) → Erf → Add(_, 1) → Mul(0.5, _) → Mul(x, _)
            elif node.op_type == "Erf":
                match = self._match_erf_gelu(model, node)
                if match is None:
                    continue
                pwp_input, pwp_output, nodes_to_remove = match
                pwp_in_shape = model.get_tensor_shape(pwp_input)
                if pwp_in_shape is None or len(pwp_in_shape) < 1:
                    continue
                idt = model.get_tensor_datatype(pwp_input)
                if idt != DataType["FLOAT32"]:
                    continue

                new_node = self._make_pwpolyf_node(
                    pwp_input, pwp_output, "gelu", pwp_in_shape, idt,
                    "PWPolyF_" + node.name,
                )
                graph.node.insert(node_ind, new_node)
                for nd in nodes_to_remove:
                    graph.node.remove(nd)
                graph_modified = True

        return (model, graph_modified)
num_input_vecs + [num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + + act_node = helper.make_node(op_type, ["inp"], ["outp"], name=op_type + "_0") + graph = helper.make_graph([act_node], "test_graph", [inp], [outp]) + model = helper.make_model(graph, producer_name="test") + model.opset_import[0].version = 20 + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model + + +def make_silu_pattern_model(num_channels, num_input_vecs): + """Build ONNX model with Sigmoid + Mul pattern (SiLU).""" + shape = num_input_vecs + [num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape) + + sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0") + mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp"], name="Mul_0") + + graph = helper.make_graph( + [sigmoid_node, mul_node], "silu_graph", [inp], [outp], + ) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model.graph.value_info.append(sig_out) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model + + +def make_erf_gelu_model(num_channels, num_input_vecs): + """Build ONNX model with the Erf-based GELU decomposition. 
+ + Pattern: x * 0.5 * (1 + erf(x / sqrt(2))) + Nodes: Div(x, sqrt(2)) -> Erf -> Add(_, 1) -> Mul(0.5, _) -> Mul(x, _) + """ + shape = num_input_vecs + [num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + + sqrt2 = helper.make_tensor("sqrt2", TensorProto.FLOAT, [], [np.float32(np.sqrt(2))]) + one = helper.make_tensor("one", TensorProto.FLOAT, [], [np.float32(1.0)]) + half = helper.make_tensor("half", TensorProto.FLOAT, [], [np.float32(0.5)]) + + div_node = helper.make_node("Div", ["inp", "sqrt2"], ["div_out"], name="Div_0") + erf_node = helper.make_node("Erf", ["div_out"], ["erf_out"], name="Erf_0") + add_node = helper.make_node("Add", ["erf_out", "one"], ["add_out"], name="Add_0") + mul_half_node = helper.make_node("Mul", ["half", "add_out"], ["mul_half_out"], name="Mul_0") + mul_x_node = helper.make_node("Mul", ["inp", "mul_half_out"], ["outp"], name="Mul_1") + + graph = helper.make_graph( + [div_node, erf_node, add_node, mul_half_node, mul_x_node], + "erf_gelu_graph", [inp], [outp], + initializer=[sqrt2, one, half], + ) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return model + + +# ---------- standard ONNX op inference tests ---------- + + +@pytest.mark.parametrize("op_type,expected_func", [ + ("Gelu", "gelu"), + ("Sigmoid", "sigmoid"), + ("Tanh", "tanh"), +]) +@pytest.mark.parametrize("num_channels", [4, 16]) +@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_standard_op(op_type, expected_func, + num_channels, num_input_vecs): + model = make_standard_activation_model(op_type, num_channels, num_input_vecs) + + assert model.graph.node[0].op_type == op_type + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + node = 
model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain == "finn.custom_op.fpgadataflow" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == expected_func + assert inst.get_nodeattr("K") == 3 + assert inst.get_nodeattr("NumChannels") == num_channels + assert inst.get_nodeattr("PE") == 1 + assert inst.get_nodeattr("inputDataType") == "FLOAT32" + + +@pytest.mark.parametrize("num_channels", [4, 16]) +@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_silu_pattern(num_channels, num_input_vecs): + model = make_silu_pattern_model(num_channels, num_input_vecs) + + assert len(model.graph.node) == 2 + assert model.graph.node[0].op_type == "Sigmoid" + assert model.graph.node[1].op_type == "Mul" + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + node = model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain == "finn.custom_op.fpgadataflow" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == "silu" + assert inst.get_nodeattr("K") == 3 + assert inst.get_nodeattr("NumChannels") == num_channels + + +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_silu_reversed_mul_inputs(): + """SiLU detection works regardless of Mul input order.""" + num_channels = 8 + shape = [1, num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) + sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape) + + sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0") + mul_node = helper.make_node("Mul", ["sig_out", "inp"], ["outp"], name="Mul_0") + + graph = helper.make_graph([sigmoid_node, mul_node], "silu_graph", [inp], [outp]) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model.graph.value_info.append(sig_out) + model = model.transform(InferShapes()) 
+ model = model.transform(InferDataTypes()) + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + inst = getCustomOp(model.graph.node[0]) + assert inst.get_nodeattr("func") == "silu" + + +@pytest.mark.fpgadataflow +def test_pwpolyf_sigmoid_multi_consumer_no_silu(): + """Sigmoid with multiple consumers becomes standalone sigmoid, not silu.""" + num_channels = 8 + shape = [1, num_channels] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + outp1 = helper.make_tensor_value_info("outp1", TensorProto.FLOAT, shape) + outp2 = helper.make_tensor_value_info("outp2", TensorProto.FLOAT, shape) + sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape) + + sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0") + mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp1"], name="Mul_0") + identity_node = helper.make_node("Identity", ["sig_out"], ["outp2"], name="Id_0") + + graph = helper.make_graph( + [sigmoid_node, mul_node, identity_node], "test_graph", + [inp], [outp1, outp2], + ) + model = helper.make_model(graph, producer_name="test") + model = ModelWrapper(model) + model.graph.value_info.append(sig_out) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + model = model.transform(InferPWPolyFLayer()) + + pwp_nodes = [n for n in model.graph.node if n.op_type == "PWPolyF"] + assert len(pwp_nodes) == 1 + inst = getCustomOp(pwp_nodes[0]) + assert inst.get_nodeattr("func") == "sigmoid" + # Mul and Identity should remain + assert any(n.op_type == "Mul" for n in model.graph.node) + assert any(n.op_type == "Identity" for n in model.graph.node) + + +@pytest.mark.parametrize("op_type,expected_func", [ + ("Gelu", "gelu"), + ("Sigmoid", "sigmoid"), + ("Tanh", "tanh"), +]) +@pytest.mark.fpgadataflow +def test_pwpolyf_standard_op_execution(op_type, expected_func): + num_channels = 16 + model = make_standard_activation_model(op_type, 
num_channels, [1]) + model = model.transform(InferPWPolyFLayer()) + + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + y_produced = oxe.execute_onnx(model, {"inp": x})["outp"] + + ref_mod = PiecewisePolyActivation(expected_func, K=3) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + assert np.allclose(y_produced, y_expected, atol=1e-6) + + +@pytest.mark.fpgadataflow +def test_pwpolyf_silu_pattern_execution(): + num_channels = 16 + model = make_silu_pattern_model(num_channels, [1]) + model = model.transform(InferPWPolyFLayer()) + + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + y_produced = oxe.execute_onnx(model, {"inp": x})["outp"] + + ref_mod = PiecewisePolyActivation("silu", K=3) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + assert np.allclose(y_produced, y_expected, atol=1e-6) + + +# ---------- Erf-based GELU inference tests ---------- + + +@pytest.mark.parametrize("num_channels", [4, 16]) +@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) +@pytest.mark.fpgadataflow +def test_pwpolyf_infer_erf_gelu_pattern(num_channels, num_input_vecs): + """Erf-based GELU decomposition (opset < 20) is converted to PWPolyF.""" + model = make_erf_gelu_model(num_channels, num_input_vecs) + + assert len(model.graph.node) == 5 + assert model.graph.node[1].op_type == "Erf" + + model = model.transform(InferPWPolyFLayer()) + + assert len(model.graph.node) == 1 + node = model.graph.node[0] + assert node.op_type == "PWPolyF" + assert node.domain == "finn.custom_op.fpgadataflow" + + inst = getCustomOp(node) + assert inst.get_nodeattr("func") == "gelu" + assert inst.get_nodeattr("K") == 3 + assert inst.get_nodeattr("NumChannels") == num_channels + assert inst.get_nodeattr("PE") == 1 + assert inst.get_nodeattr("inputDataType") == "FLOAT32" + + +@pytest.mark.fpgadataflow +def test_pwpolyf_erf_gelu_execution(): + """Erf-based GELU produces same output as 
PiecewisePolyActivation.""" + num_channels = 16 + model = make_erf_gelu_model(num_channels, [1]) + model = model.transform(InferPWPolyFLayer()) + + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + y_produced = oxe.execute_onnx(model, {"inp": x})["outp"] + + ref_mod = PiecewisePolyActivation("gelu", K=3) + with torch.no_grad(): + y_expected = ref_mod(torch.from_numpy(x)).numpy() + assert np.allclose(y_produced, y_expected, atol=1e-6) From 752ecd0a1af2ed85d2a147c3def3b791595e5af9 Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Thu, 23 Apr 2026 13:17:38 +0100 Subject: [PATCH 03/12] svh -> pkg and all k --- docs/finn/pwpolyf.md | 26 +- finn-rtllib/pwpolyf/hdl/pwpolyf.abc | 5 + finn-rtllib/pwpolyf/hdl/pwpolyf.sv | 164 +++----- finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh | 344 --------------- finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv | 395 ++++++++++++++++++ finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv | 145 +++++++ .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 71 +++- .../fpgadataflow/convert_to_hw_layers.py | 23 +- 8 files changed, 683 insertions(+), 490 deletions(-) create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf.abc delete mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md index a2d6544e04..cd8510a7ef 100644 --- a/docs/finn/pwpolyf.md +++ b/docs/finn/pwpolyf.md @@ -3,9 +3,12 @@ ## Overview PWPolyF is a hardware activation layer that approximates nonlinear functions -(GELU, SiLU, Sigmoid, Tanh) using degree-2 piecewise polynomials. Each segment -is evaluated via Horner's method on two cascaded DSPFP32 FMA units, giving -single-cycle-per-element throughput with no BRAM usage. +(GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated via Horner's +method on a chain of DSPFP32 FMA units. 
With the default degree 2, this uses +two cascaded DSPs per PE, giving single-cycle-per-element throughput with no +BRAM usage. Per-function configuration (clamping behaviour and polynomial +coefficients) is delivered through a SystemVerilog package (`pwpolyf_pkg`) +using a `func_cfg_t` struct. The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero region, positive octave sub-segments, and negative mirrors. With the default @@ -13,12 +16,11 @@ K=3 this gives 81 segments. Segment selection reuses the FP32 exponent/mantissa bit-fields directly, matching the RTL implementation. Polynomial coefficients are generated at HDL build time by -`generate_coeffs_svh()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials -to the reference PyTorch functions and writes the `pwpolyf_coeffs.svh` header. -This ensures the RTL coefficients always match the configured K value. - -> **Note:** The RTL currently only supports K=3. Support for other K values -> is planned for a future update to `pwpolyf.sv`. +`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials +to the reference PyTorch functions and writes `pwpolyf_pkg.sv` — a +SystemVerilog package with one `func_cfg_t` struct per activation +(clamping config + coefficient table). K can take any value; it defaults +to 3 when inferred from standard ONNX ops. 
## Architecture @@ -130,7 +132,7 @@ Attributes on the explicit PWPolyF ONNX node: | File | Purpose | |------|---------| | `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) | -| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, coefficient SVH generation, rtlsim, IPI) | +| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, package generation, rtlsim, IPI) | | `util/pwpolyf.py` | PyTorch activation module, ONNX export, software simulation | | `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation | | `builder/build_dataflow_steps.py` | Build pipeline integration | @@ -140,8 +142,8 @@ Attributes on the explicit PWPolyF ONNX node: | File | Purpose | |------|---------| -| `finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Core polynomial evaluation pipeline | -| `finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh` | Default K=3 coefficients (regenerated at build time) | +| `finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv` | `func_cfg_t` struct per activation (coeffs + clamp config, regenerated per K) | +| `finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Polynomial evaluation pipeline (Horner chain on DSPFP32) | | `finn-rtllib/pwpolyf/hdl/queue.sv` | Elastic FIFO for backpressure | | `finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v` | AXI-Stream wrapper template | diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.abc b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc new file mode 100644 index 0000000000..06b77b967d --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc @@ -0,0 +1,5 @@ +import queue +read_sv pwpolyf_pkg.sv +read_sv pwpolyf.sv +setup_tb pwpolyf_tb +setup_top pwpolyf \ No newline at end of file diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv index 51196a9db6..a2257fe17f 100644 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv @@ -10,20 +10,21 @@ * @description * Supports GELU, SiLU, Sigmoid, and Tanh via 
`parameter string FUNC`. * - * Approximated by piecewise degree-2 polynomials over segments defined - * by FP32 bit-extraction. Evaluated via Horner's method on a chain of - * 2 DSPFP32 instances, each computing FMA: out = C + A*B. + * Approximated by piecewise degree-D polynomials over segments defined + * by FP32 bit-extraction, where D = DEGREE from pwpolyf_pkg. + * Evaluated via Horner's method on a chain + * of D DSPFP32 instances, each computing FMA: out = C + A*B. * - * Horner: y = a_0 + x*(a_1 + a_2*x) - * Stage 0: out = a_1 + a_2 * x (A=coeff[2], B=x, C=coeff[1]) - * Stage 1: out = a_0 + prev * x (A=prev, B=x, C=coeff[0]) + * Horner (degree D): y = a_0 + x*(a_1 + x*(... + x*a_D)) + * Stage 0: out = a_{D-1} + a_D * x + * Stage j: out = a_{D-1-j} + prev * x (j = 1 .. D-1) * * Clamping for |x| >= 8 (5 octaves): * GELU/SiLU: neg -> 0, pos -> x (pass-through) * Sigmoid: neg -> 0, pos -> 1.0 * Tanh: neg -> -1, pos -> 1.0 * - * Latency: 8 cycles (2 DSP stages x 4 cycles each). II=1. + * Latency: D * DSP_LAT cycles (D DSP stages x 4 cycles each). II=1. ***************************************************************************/ //===----------------------------------------------------------------------===// @@ -47,9 +48,9 @@ module pwpolyf_dspfp32 ( // FPOPMODE[1:0] = 01 (FP mode enable) localparam logic [6:0] MODE_FMA = 7'b00_110_01; - logic invalid; - logic overflow; - logic underflow; + uwire invalid; + uwire overflow; + uwire underflow; DSPFP32 #( .A_FPTYPE("B32"), @@ -119,7 +120,7 @@ endmodule : pwpolyf_dspfp32 //===----------------------------------------------------------------------===// // Full PE-wide streaming activation with piecewise polynomial approximation. -// Hardcoded for DEGREE=2 from pwpolyf_coeffs.svh. +// Degree D derived from DEGREE in pwpolyf_pkg. 
//===----------------------------------------------------------------------===// module pwpolyf #( int unsigned PE = 1, @@ -140,18 +141,15 @@ module pwpolyf #( input logic yrdy ); - `include "pwpolyf_coeffs.svh" + import pwpolyf_pkg::*; - localparam int unsigned K = PWPOLYF_K; - localparam int unsigned NUM_SEGS = PWPOLYF_NUM_SEGS; localparam int unsigned NUM_SUBS = 1 << K; - localparam int unsigned NUM_OCTAVES = PWPOLYF_NUM_OCTAVES; localparam int unsigned DSP_LAT = 4; - localparam int unsigned LATENCY = 2 * DSP_LAT; // DEGREE=2 + localparam int unsigned LATENCY = DEGREE * DSP_LAT; initial begin - assert(PWPOLYF_DEGREE == 2) else begin - $error("%m: This implementation requires PWPOLYF_DEGREE == 2."); + assert(DEGREE >= 1) else begin + $error("%m: DEGREE must be >= 1."); $finish; end assert(FUNC == "gelu" || FUNC == "silu" || FUNC == "sigmoid" || FUNC == "tanh") else begin @@ -160,20 +158,12 @@ module pwpolyf #( end end - //=== Per-activation clamping parameters ================================== - localparam logic [31:0] NEG_CLAMP_VAL = - FUNC == "tanh" ? 32'hBF800000 : 32'h00000000; // tanh: -1.0, else: 0.0 - localparam logic [31:0] POS_CLAMP_VAL = - (FUNC == "sigmoid" || FUNC == "tanh") ? 32'h3F800000 : 32'h00000000; // sigmoid/tanh: 1.0 - localparam bit POS_PASSTHROUGH = - (FUNC == "gelu" || FUNC == "silu") ? 1 : 0; // gelu/silu: output=x - - //=== Coefficient selection =============================================== - localparam logic [31:0] COEFFS[NUM_SEGS][3] = - FUNC == "gelu" ? PWPOLYF_GELU_COEFFS : - FUNC == "silu" ? PWPOLYF_SILU_COEFFS : - FUNC == "sigmoid" ? PWPOLYF_SIGMOID_COEFFS : - PWPOLYF_TANH_COEFFS; + //=== Per-activation configuration ======================================= + localparam func_cfg_t CFG = + FUNC == "gelu" ? GELU : + FUNC == "silu" ? SILU : + FUNC == "sigmoid" ? 
SIGMOID : + TANH; //=== Clamping exponent threshold ========================================= localparam int unsigned EXP_CLAMP = 130; // |x| >= 8.0 @@ -214,7 +204,7 @@ module pwpolyf #( uwire [PE-1:0] rvld_vec; uwire rvld; - for(genvar pe = 0; pe < PE; pe++) begin : genPE + for(genvar pe = 0; pe < PE; pe++) begin : gen_pe uwire [31:0] xi = x_cur[pe]; //--- Segment selector (combinational) -------------------------------- @@ -232,22 +222,17 @@ module pwpolyf #( // Segment index for ROM lookup uwire [6:0] seg_idx; - if(1) begin : blkSegIdx + if(1) begin : blk_seg_idx uwire [6:0] pos_idx = 7'd1 + {1'b0, octave, sub}; uwire [6:0] neg_idx = 7'(7'd1 + NUM_SUBS * NUM_OCTAVES) + {1'b0, octave, sub}; assign seg_idx = is_near_zero? 7'd0 : sign? neg_idx : pos_idx; - end : blkSegIdx + end : blk_seg_idx - //--- Coefficient lookup (combinational) ------------------------------ - uwire [31:0] coeff_a0 = COEFFS[seg_idx][0]; - uwire [31:0] coeff_a1 = COEFFS[seg_idx][1]; - uwire [31:0] coeff_a2 = COEFFS[seg_idx][2]; - - //--- Horner chain: 2 stages of pwpolyf_dspfp32 ---------------------- - // Stage 0: s0 = a1 + a2 * x (latency: 4 cycles) - // Stage 1: s1 = a0 + s0 * x (latency: 4 cycles) - // Total: 8 cycles + //--- Horner chain: DEGREE stages of pwpolyf_dspfp32 ------------------ + // Stage 0: s[0] = coeff[DEGREE-1] + coeff[DEGREE] * x + // Stage j: s[j] = coeff[DEGREE-1-j] + s[j-1] * x_delayed + // Total: DEGREE * DSP_LAT cycles // Valid pipeline logic [LATENCY-1:0] Vld = '0; @@ -257,57 +242,42 @@ module pwpolyf #( end assign rvld_vec[pe] = Vld[$left(Vld)]; - // Delay x by 4 cycles for stage 1 input - logic [31:0] Xd1 = 'x; - logic [31:0] Xd2 = 'x; - logic [31:0] Xd3 = 'x; - logic [31:0] Xd4 = 'x; - always_ff @(posedge clk) begin - Xd1 <= xi; - Xd2 <= Xd1; - Xd3 <= Xd2; - Xd4 <= Xd3; - end - - // Delay x by 8 cycles for pass-through on positive clamp - logic [31:0] Xd5 = 'x; - logic [31:0] Xd6 = 'x; - logic [31:0] Xd7 = 'x; - logic [31:0] Xd8 = 'x; - always_ff @(posedge clk) 
begin - Xd5 <= Xd4; - Xd6 <= Xd5; - Xd7 <= Xd6; - Xd8 <= Xd7; - end - - // Delay a0 by 4 cycles for stage 1 C input - logic [31:0] C0d1 = 'x; - logic [31:0] C0d2 = 'x; - logic [31:0] C0d3 = 'x; - logic [31:0] C0d4 = 'x; + // Delay x for DSP B inputs and pass-through clamp + logic [31:0] XDly[LATENCY] = '{default: 'x}; always_ff @(posedge clk) begin - C0d1 <= coeff_a0; - C0d2 <= C0d1; - C0d3 <= C0d2; - C0d4 <= C0d3; + XDly[0] <= xi; + for(int i = 1; i < LATENCY; i++) + XDly[i] <= XDly[i-1]; end - // Stage 0: s0 = coeff_a1 + coeff_a2 * xi - uwire [31:0] s0; - pwpolyf_dspfp32 dsp0 ( - .clk, .rst, - .a(coeff_a2), .b(xi), .c(coeff_a1), - .r(s0), .rvld(Vld[3]) - ); - - // Stage 1: s1 = a0_delayed + s0 * x_delayed - uwire [31:0] s1; - pwpolyf_dspfp32 dsp1 ( - .clk, .rst, - .a(s0), .b(Xd4), .c(C0d4), - .r(s1), .rvld(Vld[7]) - ); + // DSP chain + uwire [31:0] s[DEGREE]; + + for(genvar j = 0; j < DEGREE; j++) begin : genDSP + uwire [31:0] dsp_a = (j == 0)? CFG.coeffs[seg_idx][DEGREE] : s[j-1]; + uwire [31:0] dsp_b = (j == 0)? xi : XDly[j*DSP_LAT - 1]; + + // C input: coeff[DEGREE-1-j] delayed by j*DSP_LAT cycles + logic [31:0] dsp_c; + if(j == 0) begin : genCdir + assign dsp_c = CFG.coeffs[seg_idx][DEGREE-1]; + end : genCdir + else begin : genCdly + logic [31:0] CDly[j*DSP_LAT] = '{default: 'x}; + always_ff @(posedge clk) begin + CDly[0] <= CFG.coeffs[seg_idx][DEGREE-1-j]; + for(int i = 1; i < j*DSP_LAT; i++) + CDly[i] <= CDly[i-1]; + end + assign dsp_c = CDly[j*DSP_LAT - 1]; + end : genCdly + + pwpolyf_dspfp32 dsp ( + .clk, .rst, + .a(dsp_a), .b(dsp_b), .c(dsp_c), + .r(s[j]), .rvld(Vld[(j+1)*DSP_LAT - 1]) + ); + end : genDSP //--- Clamp mux ------------------------------------------------------- logic [LATENCY-1:0] NegClamp = '0; @@ -324,11 +294,11 @@ module pwpolyf #( end // Output mux - assign r[pe] = NegClamp[$left(NegClamp)]? NEG_CLAMP_VAL : - PosClamp[$left(PosClamp)]? (POS_PASSTHROUGH? Xd8 : POS_CLAMP_VAL) : - s1; + assign r[pe] = NegClamp[$left(NegClamp)]? 
CFG.neg_clamp : + PosClamp[$left(PosClamp)]? (CFG.pos_passthrough? XDly[LATENCY-1] : CFG.pos_clamp) : + s[DEGREE-1]; - end : genPE + end : gen_pe // All PE results should be valid simultaneously assign rvld = rvld_vec[0]; @@ -353,4 +323,4 @@ module pwpolyf #( end end -endmodule : pwpolyf +endmodule : pwpolyf \ No newline at end of file diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh b/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh deleted file mode 100644 index 4783a69a8c..0000000000 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh +++ /dev/null @@ -1,344 +0,0 @@ -// Auto-generated by pwpolyf_sim.py — do not edit manually. -// K=3, NUM_SEGS=81, NUM_OCTAVES=5, DEGREE=2 - -localparam int unsigned PWPOLYF_K = 3; -localparam int unsigned PWPOLYF_NUM_SEGS = 81; -localparam int unsigned PWPOLYF_NUM_OCTAVES = 5; -localparam int unsigned PWPOLYF_DEGREE = 2; - -localparam logic [31:0] PWPOLYF_GELU_COEFFS[81][3] = '{ - '{ 32'h37B98E70, 32'h3F000000, 32'h3ECA71FE }, // seg 0 - '{ 32'hBA7ADBC7, 32'h3F0278C5, 32'h3EBE3708 }, // seg 1 - '{ 32'hBAC1FAC3, 32'h3F036C36, 32'h3EBAD62D }, // seg 2 - '{ 32'hBB0F7119, 32'h3F049537, 32'h3EB7205C }, // seg 3 - '{ 32'hBB4C3199, 32'h3F05F665, 32'h3EB31D7A }, // seg 4 - '{ 32'hBB8CE270, 32'h3F0793DC, 32'h3EAECF5D }, // seg 5 - '{ 32'hBBBD42FF, 32'h3F096FF1, 32'h3EAA3BDC }, // seg 6 - '{ 32'hBBF86AD7, 32'h3F0B8C93, 32'h3EA5686A }, // seg 7 - '{ 32'hBC1FE938, 32'h3F0DEDAD, 32'h3EA05544 }, // seg 8 - '{ 32'hBC61967B, 32'h3F11FAE4, 32'h3E985544 }, // seg 9 - '{ 32'hBCA9F9E2, 32'h3F1853F6, 32'h3E8D0DA0 }, // seg 10 - '{ 32'hBCF4024D, 32'h3F1FBA03, 32'h3E81380E }, // seg 11 - '{ 32'hBD283275, 32'h3F281F23, 32'h3E6A0515 }, // seg 12 - '{ 32'hBD6012DC, 32'h3F316E8A, 32'h3E5131A8 }, // seg 13 - '{ 32'hBD90E65A, 32'h3F3B8AB1, 32'h3E384EA9 }, // seg 14 - '{ 32'hBDB69B12, 32'h3F46505E, 32'h3E1FAEE2 }, // seg 15 - '{ 32'hBDE0E236, 32'h3F519680, 32'h3E07A0FB }, // seg 16 - '{ 32'hBE13A447, 32'h3F62F98E, 32'h3DCA8920 }, // seg 17 - '{ 32'hBE483C53, 
32'h3F7A5DD5, 32'h3D6E8B3A }, // seg 18 - '{ 32'hBE7FADD1, 32'h3F8848AA, 32'h3CC08378 }, // seg 19 - '{ 32'hBE9AF2F6, 32'h3F9227E3, 32'hBB967C2C }, // seg 20 - '{ 32'hBEB35A6C, 32'h3F9A4E1A, 32'hBCD3D2D3 }, // seg 21 - '{ 32'hBEC733E2, 32'h3FA06D38, 32'hBD2655D9 }, // seg 22 - '{ 32'hBED5117D, 32'h3FA46699, 32'hBD4ACA8A }, // seg 23 - '{ 32'hBEDC221E, 32'h3FA64B98, 32'hBD5B0CC8 }, // seg 24 - '{ 32'hBED977F6, 32'h3FA5A98F, 32'hBD563F50 }, // seg 25 - '{ 32'hBEC1A7EF, 32'h3FA066C3, 32'hBD310A4D }, // seg 26 - '{ 32'hBE9AF247, 32'h3F98AA16, 32'hBCFF13C8 }, // seg 27 - '{ 32'hBE609014, 32'h3F90E5FD, 32'hBCA4952F }, // seg 28 - '{ 32'hBE1465DA, 32'h3F8A89C8, 32'hBC412BC3 }, // seg 29 - '{ 32'hBDB39147, 32'h3F860470, 32'hBBCFC6D8 }, // seg 30 - '{ 32'hBD47BD32, 32'h3F832984, 32'hBB4E0A09 }, // seg 31 - '{ 32'hBCCC65EE, 32'h3F8187F7, 32'hBABC9CBA }, // seg 32 - '{ 32'hBC07A969, 32'h3F807817, 32'hB9D51508 }, // seg 33 - '{ 32'hBABA43A4, 32'h3F8012B3, 32'hB870A727 }, // seg 34 - '{ 32'hB93762D6, 32'h3F800216, 32'hB6C22359 }, // seg 35 - '{ 32'h3411E06E, 32'h3F800000, 32'h27AB3551 }, // seg 36 - '{ 32'h341E8FEE, 32'h3F800000, 32'h28A3B0E4 }, // seg 37 - '{ 32'h342B3EFE, 32'h3F800000, 32'hA7CDB1C0 }, // seg 38 - '{ 32'h3437EE42, 32'h3F800000, 32'hA8538CE4 }, // seg 39 - '{ 32'h34449DB9, 32'h3F800000, 32'hA71AF986 }, // seg 40 - '{ 32'hBA7AD37E, 32'h3EFB0E96, 32'h3EBE3747 }, // seg 41 - '{ 32'hBAC20DB3, 32'h3EF92715, 32'h3EBAD556 }, // seg 42 - '{ 32'hBB0F6B5B, 32'h3EF6D5D9, 32'h3EB720C8 }, // seg 43 - '{ 32'hBB4C290B, 32'h3EF41395, 32'h3EB31DFE }, // seg 44 - '{ 32'hBB8CE04D, 32'h3EF0D873, 32'h3EAECF95 }, // seg 45 - '{ 32'hBBBD43D9, 32'h3EED2010, 32'h3EAA3BCD }, // seg 46 - '{ 32'hBBF87DB4, 32'h3EE8E58C, 32'h3EA566F9 }, // seg 47 - '{ 32'hBC1FE3E0, 32'h3EE42555, 32'h3EA055F8 }, // seg 48 - '{ 32'hBC6197C8, 32'h3EDC0A12, 32'h3E98551F }, // seg 49 - '{ 32'hBCA9FA8F, 32'h3ECF57EF, 32'h3E8D0D83 }, // seg 50 - '{ 32'hBCF40310, 32'h3EC08BD5, 32'h3E8137F1 }, // seg 51 - '{ 
32'hBD2834D6, 32'h3EAFC0E3, 32'h3E6A03EA }, // seg 52 - '{ 32'hBD6013D1, 32'h3E9D229D, 32'h3E513144 }, // seg 53 - '{ 32'hBD90E5AD, 32'h3E88EB08, 32'h3E384F28 }, // seg 54 - '{ 32'hBDB6985F, 32'h3E66C185, 32'h3E1FB08A }, // seg 55 - '{ 32'hBDE0DF77, 32'h3E39A8DB, 32'h3E07A275 }, // seg 56 - '{ 32'hBE13A40F, 32'h3DE8345F, 32'h3DCA8983 }, // seg 57 - '{ 32'hBE483C8C, 32'h3CB44289, 32'h3D6E8AAF }, // seg 58 - '{ 32'hBE7FAE75, 32'hBD848C95, 32'h3CC08085 }, // seg 59 - '{ 32'hBE9AF25E, 32'hBE113D70, 32'hBB9669BA }, // seg 60 - '{ 32'hBEB35AF3, 32'hBE527226, 32'hBCD3D632 }, // seg 61 - '{ 32'hBEC73409, 32'hBE81B50C, 32'hBD265643 }, // seg 62 - '{ 32'hBED511FB, 32'hBE919AF1, 32'hBD4ACBC1 }, // seg 63 - '{ 32'hBEDC2198, 32'hBE992DD4, 32'hBD5B0BA5 }, // seg 64 - '{ 32'hBED9784C, 32'hBE96A68E, 32'hBD563FE9 }, // seg 65 - '{ 32'hBEC1A80B, 32'hBE819B22, 32'hBD310A73 }, // seg 66 - '{ 32'hBE9AF281, 32'hBE45510D, 32'hBCFF1457 }, // seg 67 - '{ 32'hBE60906A, 32'hBE073026, 32'hBCA4958A }, // seg 68 - '{ 32'hBE146346, 32'hBDA8992A, 32'hBC41276D }, // seg 69 - '{ 32'hBDB38EDC, 32'hBD408B15, 32'hBBCFC36C }, // seg 70 - '{ 32'hBD47AFC4, 32'hBCCA5237, 32'hBB4DF9C3 }, // seg 71 - '{ 32'hBCCC9FAA, 32'hBC443685, 32'hBABCD98B }, // seg 72 - '{ 32'hBC07BF94, 32'hBB7057D9, 32'hB9D53AA2 }, // seg 73 - '{ 32'hBABACF46, 32'hBA160B9E, 32'hB8715A94 }, // seg 74 - '{ 32'hB93E544F, 32'hB88BBAC8, 32'hB6CD58B9 }, // seg 75 - '{ 32'hB80D0FA0, 32'hB7425B86, 32'hB585D73F }, // seg 76 - '{ 32'h00000000, 32'h00000000, 32'h00000000 }, // seg 77 - '{ 32'h00000000, 32'h00000000, 32'h00000000 }, // seg 78 - '{ 32'h00000000, 32'h00000000, 32'h00000000 }, // seg 79 - '{ 32'h00000000, 32'h00000000, 32'h00000000 } // seg 80 -}; - -localparam logic [31:0] PWPOLYF_SILU_COEFFS[81][3] = '{ - '{ 32'h36E95DF5, 32'h3F000000, 32'h3E7EDC5E }, // seg 0 - '{ 32'hB99F1DCE, 32'h3F00C86D, 32'h3E771EC2 }, // seg 1 - '{ 32'hB9F6B213, 32'h3F01162E, 32'h3E74F652 }, // seg 2 - '{ 32'hBA36FFF7, 32'h3F017588, 32'h3E72946C }, // seg 3 
- '{ 32'hBA82DBFB, 32'h3F01E7EE, 32'h3E6FFB3C }, // seg 4 - '{ 32'hBAB54FCB, 32'h3F026E5A, 32'h3E6D2ECC }, // seg 5 - '{ 32'hBAF49B79, 32'h3F030A0C, 32'h3E6A30AE }, // seg 6 - '{ 32'hBB212F9B, 32'h3F03BBB5, 32'h3E6704CC }, // seg 7 - '{ 32'hBB50782F, 32'h3F048563, 32'h3E63A86F }, // seg 8 - '{ 32'hBB945CE5, 32'h3F05E1D4, 32'h3E5E4873 }, // seg 9 - '{ 32'hBBE25E30, 32'h3F080BF2, 32'h3E569788 }, // seg 10 - '{ 32'hBC24C259, 32'h3F0A9F94, 32'h3E4E59B3 }, // seg 11 - '{ 32'hBC66B42F, 32'h3F0D9E75, 32'h3E45A38E }, // seg 12 - '{ 32'hBC9C5244, 32'h3F11080E, 32'h3E3C8A8D }, // seg 13 - '{ 32'hBCCE04C3, 32'h3F14DA5B, 32'h3E3322D2 }, // seg 14 - '{ 32'hBD04800C, 32'h3F191096, 32'h3E298289 }, // seg 15 - '{ 32'hBD26E43B, 32'h3F1DA640, 32'h3E1FBAD7 }, // seg 16 - '{ 32'hBD637E9F, 32'h3F252125, 32'h3E10F462 }, // seg 17 - '{ 32'hBDA33B9D, 32'h3F301FA3, 32'h3DFAD009 }, // seg 18 - '{ 32'hBDDE6B32, 32'h3F3BF5F1, 32'h3DD4EBDE }, // seg 19 - '{ 32'hBE1121CB, 32'h3F484C5B, 32'h3DB103B4 }, // seg 20 - '{ 32'hBE369CC7, 32'h3F54CBA1, 32'h3D8FABD4 }, // seg 21 - '{ 32'hBE5EB1E1, 32'h3F612225, 32'h3D6290D3 }, // seg 22 - '{ 32'hBE842968, 32'h3F6D086D, 32'h3D2C2202 }, // seg 23 - '{ 32'hBE99340A, 32'h3F7842D6, 32'h3CF862F4 }, // seg 24 - '{ 32'hBEB7B0F6, 32'h3F83AB48, 32'h3C810FDC }, // seg 25 - '{ 32'hBEDD21FD, 32'h3F8C031A, 32'h3AA0895B }, // seg 26 - '{ 32'hBEFBE1D7, 32'h3F922E5A, 32'hBC0A6628 }, // seg 27 - '{ 32'hBF0931A3, 32'h3F9649B5, 32'hBC6A59DD }, // seg 28 - '{ 32'hBF101E8F, 32'h3F989B85, 32'hBC8E0AB0 }, // seg 29 - '{ 32'hBF12EEED, 32'h3F997B24, 32'hBC96B85F }, // seg 30 - '{ 32'hBF121CE8, 32'h3F994071, 32'hBC94AB86 }, // seg 31 - '{ 32'hBF0E4CE0, 32'h3F983CFA, 32'hBC8C0C06 }, // seg 32 - '{ 32'hBF047881, 32'h3F95D0C8, 32'hBC71DF34 }, // seg 33 - '{ 32'hBEE597C0, 32'h3F91E386, 32'hBC3A0377 }, // seg 34 - '{ 32'hBEBEB59D, 32'h3F8DFF08, 32'hBC081E5A }, // seg 35 - '{ 32'hBE994535, 32'h3F8A965C, 32'hBBC0C1D9 }, // seg 36 - '{ 32'hBE700CF7, 32'h3F87CFE4, 32'hBB856F55 }, // seg 37 
- '{ 32'hBE37FE86, 32'h3F85A6F4, 32'hBB35A189 }, // seg 38 - '{ 32'hBE0A8F49, 32'h3F8406CD, 32'hBAF420DA }, // seg 39 - '{ 32'hBDCD8C89, 32'h3F82D4E6, 32'hBAA265F3 }, // seg 40 - '{ 32'hB99F15B4, 32'h3EFE6F36, 32'h3E771F07 }, // seg 41 - '{ 32'hB9F6E275, 32'h3EFDD355, 32'h3E74F54A }, // seg 42 - '{ 32'hBA370764, 32'h3EFD14DB, 32'h3E729434 }, // seg 43 - '{ 32'hBA82CBD2, 32'h3EFC307E, 32'h3E6FFC3C }, // seg 44 - '{ 32'hBAB519A0, 32'h3EFB2461, 32'h3E6D318E }, // seg 45 - '{ 32'hBAF49002, 32'h3EF9EC20, 32'h3E6A3138 }, // seg 46 - '{ 32'hBB213894, 32'h3EF88848, 32'h3E670422 }, // seg 47 - '{ 32'hBB509289, 32'h3EF6F461, 32'h3E63A6AF }, // seg 48 - '{ 32'hBB946026, 32'h3EF43C28, 32'h3E5E481D }, // seg 49 - '{ 32'hBBE25D58, 32'h3EEFE827, 32'h3E56979C }, // seg 50 - '{ 32'hBC24C311, 32'h3EEAC0C7, 32'h3E4E599C }, // seg 51 - '{ 32'hBC66B0C8, 32'h3EE4C363, 32'h3E45A3FC }, // seg 52 - '{ 32'hBC9C5250, 32'h3EDDEFE3, 32'h3E3C8A8A }, // seg 53 - '{ 32'hBCCE0263, 32'h3ED64BA6, 32'h3E33233F }, // seg 54 - '{ 32'hBD0483B9, 32'h3ECDDDD0, 32'h3E29816B }, // seg 55 - '{ 32'hBD26E21A, 32'h3EC4B40E, 32'h3E1FBB6C }, // seg 56 - '{ 32'hBD637E94, 32'h3EB5BDB7, 32'h3E10F463 }, // seg 57 - '{ 32'hBDA33BD9, 32'h3E9FC0A2, 32'h3DFACFDF }, // seg 58 - '{ 32'hBDDE6AB7, 32'h3E88144E, 32'h3DD4EC2A }, // seg 59 - '{ 32'hBE11213C, 32'h3E5ECF5B, 32'h3DB10440 }, // seg 60 - '{ 32'hBE369BA8, 32'h3E2CD2EE, 32'h3D8FACC1 }, // seg 61 - '{ 32'hBE5EB20C, 32'h3DF6EE82, 32'h3D6290A4 }, // seg 62 - '{ 32'hBE8428DF, 32'h3D97BEFB, 32'h3D2C2354 }, // seg 63 - '{ 32'hBE9933B8, 32'h3CF7AA94, 32'h3CF8644F }, // seg 64 - '{ 32'hBEB7B09B, 32'hBCEACCA1, 32'h3C811126 }, // seg 65 - '{ 32'hBEDD2230, 32'hBDC0323A, 32'h3AA081A5 }, // seg 66 - '{ 32'hBEFBE177, 32'hBE11723B, 32'hBC0A645F }, // seg 67 - '{ 32'hBF093217, 32'hBE324EF2, 32'hBC6A5D77 }, // seg 68 - '{ 32'hBF101F44, 32'hBE44DDF5, 32'hBC8E0CF8 }, // seg 69 - '{ 32'hBF12EEFF, 32'hBE4BD952, 32'hBC96B899 }, // seg 70 - '{ 32'hBF121E42, 32'hBE4A0685, 32'hBC94AED3 }, // 
seg 71 - '{ 32'hBF0E4E2A, 32'hBE41EA78, 32'hBC8C0EC0 }, // seg 72 - '{ 32'hBF047922, 32'hBE2E876E, 32'hBC71E175 }, // seg 73 - '{ 32'hBEE5994E, 32'hBE0F1D79, 32'hBC3A059D }, // seg 74 - '{ 32'hBEBEB455, 32'hBDDFEE7F, 32'hBC081CCC }, // seg 75 - '{ 32'hBE9948EA, 32'hBDA96AD6, 32'hBBC0C8E9 }, // seg 76 - '{ 32'hBE701045, 32'hBD7A00B1, 32'hBB8571EB }, // seg 77 - '{ 32'hBE3805EB, 32'hBD34E735, 32'hBB35ABED }, // seg 78 - '{ 32'hBE0A9538, 32'hBD00E03E, 32'hBAF42F6A }, // seg 79 - '{ 32'hBDCD9AF0, 32'hBCB54853, 32'hBAA2755A } // seg 80 -}; - -localparam logic [31:0] PWPOLYF_SIGMOID_COEFFS[81][3] = '{ - '{ 32'h3F000000, 32'h3E7F33B4, 32'hB21FFF88 }, // seg 0 - '{ 32'h3EFFCF27, 32'h3E822CCD, 32'hBC84C1F2 }, // seg 1 - '{ 32'h3EFFBC74, 32'h3E82B1D2, 32'hBC938C36 }, // seg 2 - '{ 32'h3EFFA5B5, 32'h3E834361, 32'hBCA21BEF }, // seg 3 - '{ 32'h3EFF8A9F, 32'h3E83E0E0, 32'hBCB06BD4 }, // seg 4 - '{ 32'h3EFF6B53, 32'h3E8487A2, 32'hBCBE4EDE }, // seg 5 - '{ 32'h3EFF47DE, 32'h3E853610, 32'hBCCBB848 }, // seg 6 - '{ 32'h3EFF1FD8, 32'h3E85ED00, 32'hBCD8C92D }, // seg 7 - '{ 32'h3EFEF3E7, 32'h3E86A89A, 32'hBCE54D90 }, // seg 8 - '{ 32'h3EFEA7FC, 32'h3E87D4B7, 32'hBCF7D81E }, // seg 9 - '{ 32'h3EFE3434, 32'h3E897082, 32'hBD075E74 }, // seg 10 - '{ 32'h3EFDB0A7, 32'h3E8B15AC, 32'hBD11E821 }, // seg 11 - '{ 32'h3EFD1FFF, 32'h3E8CBAB3, 32'hBD1B7B94 }, // seg 12 - '{ 32'h3EFC85A0, 32'h3E8E568F, 32'hBD2411C8 }, // seg 13 - '{ 32'h3EFBE681, 32'h3E8FDE72, 32'hBD2B9C7D }, // seg 14 - '{ 32'h3EFB46C8, 32'h3E914BCF, 32'hBD322448 }, // seg 15 - '{ 32'h3EFAAB5A, 32'h3E9297AA, 32'hBD37AD8F }, // seg 16 - '{ 32'h3EF9DBB3, 32'h3E9432BF, 32'hBD3E0993 }, // seg 17 - '{ 32'h3EF909EB, 32'h3E95A9C9, 32'hBD43470B }, // seg 18 - '{ 32'h3EF8B573, 32'h3E9632E0, 32'hBD450418 }, // seg 19 - '{ 32'h3EF909F7, 32'h3E95B9B4, 32'hBD43A89C }, // seg 20 - '{ 32'h3EFA2B65, 32'h3E943959, 32'hBD3FAB94 }, // seg 21 - '{ 32'h3EFC33ED, 32'h3E91B9FE, 32'hBD3988A9 }, // seg 22 - '{ 32'h3EFF34ED, 32'h3E8E4C2C, 32'hBD31B440 }, 
// seg 23 - '{ 32'h3F01997B, 32'h3E8A0ACB, 32'hBD28A198 }, // seg 24 - '{ 32'h3F057A8B, 32'h3E8262C5, 32'hBD198434 }, // seg 25 - '{ 32'h3F0C3064, 32'h3E6CECF8, 32'hBD0452A9 }, // seg 26 - '{ 32'h3F1447E1, 32'h3E530782, 32'hBCDF318F }, // seg 27 - '{ 32'h3F1D4A68, 32'h3E38CE80, 32'hBCB905FB }, // seg 28 - '{ 32'h3F26C195, 32'h3E1F8C72, 32'hBC9750E8 }, // seg 29 - '{ 32'h3F3044A9, 32'h3E081DC6, 32'hBC74E58A }, // seg 30 - '{ 32'h3F3982E1, 32'h3DE5F174, 32'hBC448461 }, // seg 31 - '{ 32'h3F424019, 32'h3DC09FBD, 32'hBC1CAB6F }, // seg 32 - '{ 32'h3F4DF478, 32'h3D924A5B, 32'hBBDD9BF8 }, // seg 33 - '{ 32'h3F5B2AC7, 32'h3D464E25, 32'hBB897CAF }, // seg 34 - '{ 32'h3F656FAC, 32'h3D045E36, 32'hBB291861 }, // seg 35 - '{ 32'h3F6D25C5, 32'h3CAEBA7A, 32'hBACED519 }, // seg 36 - '{ 32'h3F72CA96, 32'h3C64B5A7, 32'hBA7C2B6D }, // seg 37 - '{ 32'h3F76D7D9, 32'h3C14B4A4, 32'hBA196B60 }, // seg 38 - '{ 32'h3F79B538, 32'h3BC060EC, 32'hB9BA7A5A }, // seg 39 - '{ 32'h3F7BB5E3, 32'h3B77B8AD, 32'hB9626A80 }, // seg 40 - '{ 32'h3F001880, 32'h3E822DEE, 32'h3C84E3F8 }, // seg 41 - '{ 32'h3F0021CE, 32'h3E82B23F, 32'h3C939805 }, // seg 42 - '{ 32'h3F002D26, 32'h3E834368, 32'h3CA21C3C }, // seg 43 - '{ 32'h3F003A99, 32'h3E83DFE3, 32'h3CB05650 }, // seg 44 - '{ 32'h3F004A09, 32'h3E84848C, 32'h3CBE0FC2 }, // seg 45 - '{ 32'h3F005BFD, 32'h3E853553, 32'h3CCBAA1A }, // seg 46 - '{ 32'h3F007027, 32'h3E85EDAA, 32'h3CD8D518 }, // seg 47 - '{ 32'h3F008689, 32'h3E86AC9A, 32'h3CE58F3C }, // seg 48 - '{ 32'h3F00AC00, 32'h3E87D4A7, 32'h3CF7D732 }, // seg 49 - '{ 32'h3F00E601, 32'h3E89713C, 32'h3D076365 }, // seg 50 - '{ 32'h3F0127C8, 32'h3E8B1652, 32'h3D11EC06 }, // seg 51 - '{ 32'h3F016FFD, 32'h3E8CBAA0, 32'h3D1B7B31 }, // seg 52 - '{ 32'h3F01BD0C, 32'h3E8E55D8, 32'h3D240E23 }, // seg 53 - '{ 32'h3F020CD3, 32'h3E8FDED1, 32'h3D2B9E4A }, // seg 54 - '{ 32'h3F025CCD, 32'h3E914CAB, 32'h3D322819 }, // seg 55 - '{ 32'h3F02AA33, 32'h3E929729, 32'h3D37AB80 }, // seg 56 - '{ 32'h3F031236, 32'h3E9432FB, 
32'h3D3E0A78 }, // seg 57 - '{ 32'h3F037AF9, 32'h3E95A98C, 32'h3D43463B }, // seg 58 - '{ 32'h3F03A527, 32'h3E96327F, 32'h3D4502ED }, // seg 59 - '{ 32'h3F037B01, 32'h3E95B9AA, 32'h3D43A880 }, // seg 60 - '{ 32'h3F02EA67, 32'h3E94399C, 32'h3D3FAC3D }, // seg 61 - '{ 32'h3F01E5FE, 32'h3E91B9E3, 32'h3D39886A }, // seg 62 - '{ 32'h3F00659B, 32'h3E8E4C54, 32'h3D31B499 }, // seg 63 - '{ 32'h3EFCCCD9, 32'h3E8A0A9A, 32'h3D28A135 }, // seg 64 - '{ 32'h3EF50AC9, 32'h3E8262A6, 32'h3D1983F9 }, // seg 65 - '{ 32'h3EE79F36, 32'h3E6CECF3, 32'h3D0452A4 }, // seg 66 - '{ 32'h3ED77017, 32'h3E530747, 32'h3CDF3133 }, // seg 67 - '{ 32'h3EC56B50, 32'h3E38CEAF, 32'h3CB9063C }, // seg 68 - '{ 32'h3EB27DCB, 32'h3E1F8DAD, 32'h3C97527C }, // seg 69 - '{ 32'h3E9F76E9, 32'h3E081E0B, 32'h3C74E62F }, // seg 70 - '{ 32'h3E8CFAEA, 32'h3DE5F2F3, 32'h3C448608 }, // seg 71 - '{ 32'h3E770090, 32'h3DC0A0B8, 32'h3C1CAC71 }, // seg 72 - '{ 32'h3E482E7E, 32'h3D924AB4, 32'h3BDD9CA1 }, // seg 73 - '{ 32'h3E13552B, 32'h3D464E9B, 32'h3B897D12 }, // seg 74 - '{ 32'h3DD481AF, 32'h3D045D7B, 32'h3B29173F }, // seg 75 - '{ 32'h3D96D28F, 32'h3CAEBB74, 32'h3ACED672 }, // seg 76 - '{ 32'h3D5358A4, 32'h3C64B82E, 32'h3A7C2E99 }, // seg 77 - '{ 32'h3D1283F5, 32'h3C14B667, 32'h3A196D6B }, // seg 78 - '{ 32'h3CC95645, 32'h3BC05DE7, 32'h39BA76F4 }, // seg 79 - '{ 32'h3C8947BE, 32'h3B77C111, 32'h39627305 } // seg 80 -}; - -localparam logic [31:0] PWPOLYF_TANH_COEFFS[81][3] = '{ - '{ 32'h24C775B8, 32'h3F7CD991, 32'hA73006D1 }, // seg 0 - '{ 32'hBBAC00F6, 32'h3F87D4AF, 32'hBE77D79E }, // seg 1 - '{ 32'hBBE5F686, 32'h3F8970F2, 32'hBE87616C }, // seg 2 - '{ 32'hBC13E04D, 32'h3F8B1626, 32'hBE91EAFF }, // seg 3 - '{ 32'hBC38062F, 32'h3F8CBAF6, 32'hBE9B7D12 }, // seg 4 - '{ 32'hBC5E87C9, 32'h3F8E55E8, 32'hBEA40E6C }, // seg 5 - '{ 32'hBC833183, 32'h3F8FDE94, 32'hBEAB9D26 }, // seg 6 - '{ 32'hBC97304B, 32'h3F914C74, 32'hBEB22725 }, // seg 7 - '{ 32'hBCAA905C, 32'h3F929762, 32'hBEB7AC6C }, // seg 8 - '{ 32'hBCC48DB5, 32'h3F9432FE, 
32'hBEBE0A82 }, // seg 9 - '{ 32'hBCDEBF5B, 32'h3F95A99C, 32'hBEC34673 }, // seg 10 - '{ 32'hBCE94C8C, 32'h3F9632A2, 32'hBEC50358 }, // seg 11 - '{ 32'hBCDEBE2C, 32'h3F95B992, 32'hBEC3A83E }, // seg 12 - '{ 32'hBCBA9ED9, 32'h3F9439CF, 32'hBEBFACBF }, // seg 13 - '{ 32'hBC72F4C3, 32'h3F91B9B3, 32'hBEB987F9 }, // seg 14 - '{ 32'hBB4B4F04, 32'h3F8E4C70, 32'hBEB1B4D9 }, // seg 15 - '{ 32'h3C4CBE96, 32'h3F8A0AC9, 32'hBEA8A196 }, // seg 16 - '{ 32'h3D2F5351, 32'h3F8262A8, 32'hBE9983FC }, // seg 17 - '{ 32'h3DC30600, 32'h3F6CED04, 32'hBE8452B2 }, // seg 18 - '{ 32'h3E223FEB, 32'h3F53072C, 32'hBE5F310C }, // seg 19 - '{ 32'h3E6A52FA, 32'h3F38CE9A, 32'hBE39061F }, // seg 20 - '{ 32'h3E9B0431, 32'h3F1F8DD2, 32'hBE1752AC }, // seg 21 - '{ 32'h3EC1122B, 32'h3F081E0D, 32'hBDF4E633 }, // seg 22 - '{ 32'h3EE60A55, 32'h3EE5F2C5, 32'hBDC485D6 }, // seg 23 - '{ 32'h3F047F8C, 32'h3EC0A114, 32'hBD9CACD1 }, // seg 24 - '{ 32'h3F1BE8C6, 32'h3E924AAB, 32'hBD5D9C90 }, // seg 25 - '{ 32'h3F365584, 32'h3E464E44, 32'hBD097CC9 }, // seg 26 - '{ 32'h3F4ADF8C, 32'h3E045D95, 32'hBCA91767 }, // seg 27 - '{ 32'h3F5A4B6D, 32'h3DAEBB15, 32'hBC4ED5EF }, // seg 28 - '{ 32'h3F6594EE, 32'h3D64B815, 32'hBBFC2E78 }, // seg 29 - '{ 32'h3F6DAF91, 32'h3D14B5D0, 32'hBB996CB5 }, // seg 30 - '{ 32'h3F736A9F, 32'h3CC05DAA, 32'hBB3A76B3 }, // seg 31 - '{ 32'h3F776B54, 32'h3C77C747, 32'hBAE2796C }, // seg 32 - '{ 32'h3F7B290A, 32'h3C00F553, 32'hBA590596 }, // seg 33 - '{ 32'h3F7DD423, 32'h3B51CDAD, 32'hB99FBCDD }, // seg 34 - '{ 32'h3F7F0B0E, 32'h3AA91301, 32'hB8EB151C }, // seg 35 - '{ 32'h3F7F95A3, 32'h3A073EF5, 32'hB82D041E }, // seg 36 - '{ 32'h3F7FD274, 32'h3956AD85, 32'hB77E46D5 }, // seg 37 - '{ 32'h3F7FECAF, 32'h38A9A5DF, 32'hB6BB164C }, // seg 38 - '{ 32'h3F7FF7E0, 32'h3805A163, 32'hB609E0DD }, // seg 39 - '{ 32'h3F7FFCA1, 32'h37505C2F, 32'hB549DC3A }, // seg 40 - '{ 32'h3BAC00F6, 32'h3F87D4AF, 32'h3E77D79E }, // seg 41 - '{ 32'h3BE5F686, 32'h3F8970F2, 32'h3E87616C }, // seg 42 - '{ 32'h3C13E04D, 
32'h3F8B1626, 32'h3E91EAFF }, // seg 43 - '{ 32'h3C38062F, 32'h3F8CBAF6, 32'h3E9B7D12 }, // seg 44 - '{ 32'h3C5E87C9, 32'h3F8E55E8, 32'h3EA40E6C }, // seg 45 - '{ 32'h3C833183, 32'h3F8FDE94, 32'h3EAB9D26 }, // seg 46 - '{ 32'h3C97304B, 32'h3F914C74, 32'h3EB22725 }, // seg 47 - '{ 32'h3CAA905C, 32'h3F929762, 32'h3EB7AC6C }, // seg 48 - '{ 32'h3CC48DB5, 32'h3F9432FE, 32'h3EBE0A82 }, // seg 49 - '{ 32'h3CDEBF5B, 32'h3F95A99C, 32'h3EC34673 }, // seg 50 - '{ 32'h3CE94C8C, 32'h3F9632A2, 32'h3EC50358 }, // seg 51 - '{ 32'h3CDEBE2C, 32'h3F95B992, 32'h3EC3A83E }, // seg 52 - '{ 32'h3CBA9ED9, 32'h3F9439CF, 32'h3EBFACBF }, // seg 53 - '{ 32'h3C72F4C3, 32'h3F91B9B3, 32'h3EB987F9 }, // seg 54 - '{ 32'h3B4B4F04, 32'h3F8E4C70, 32'h3EB1B4D9 }, // seg 55 - '{ 32'hBC4CBE96, 32'h3F8A0AC9, 32'h3EA8A196 }, // seg 56 - '{ 32'hBD2F5351, 32'h3F8262A8, 32'h3E9983FC }, // seg 57 - '{ 32'hBDC30600, 32'h3F6CED04, 32'h3E8452B2 }, // seg 58 - '{ 32'hBE223FEB, 32'h3F53072C, 32'h3E5F310C }, // seg 59 - '{ 32'hBE6A52FA, 32'h3F38CE9A, 32'h3E39061F }, // seg 60 - '{ 32'hBE9B0431, 32'h3F1F8DD2, 32'h3E1752AC }, // seg 61 - '{ 32'hBEC1122B, 32'h3F081E0D, 32'h3DF4E633 }, // seg 62 - '{ 32'hBEE60A55, 32'h3EE5F2C5, 32'h3DC485D6 }, // seg 63 - '{ 32'hBF047F8C, 32'h3EC0A114, 32'h3D9CACD1 }, // seg 64 - '{ 32'hBF1BE8C6, 32'h3E924AAB, 32'h3D5D9C90 }, // seg 65 - '{ 32'hBF365584, 32'h3E464E44, 32'h3D097CC9 }, // seg 66 - '{ 32'hBF4ADF8C, 32'h3E045D95, 32'h3CA91767 }, // seg 67 - '{ 32'hBF5A4B6D, 32'h3DAEBB15, 32'h3C4ED5EF }, // seg 68 - '{ 32'hBF6594EE, 32'h3D64B815, 32'h3BFC2E78 }, // seg 69 - '{ 32'hBF6DAF91, 32'h3D14B5D0, 32'h3B996CB5 }, // seg 70 - '{ 32'hBF736A9F, 32'h3CC05DAA, 32'h3B3A76B3 }, // seg 71 - '{ 32'hBF776B54, 32'h3C77C747, 32'h3AE2796C }, // seg 72 - '{ 32'hBF7B290A, 32'h3C00F553, 32'h3A590596 }, // seg 73 - '{ 32'hBF7DD423, 32'h3B51CDAD, 32'h399FBCDD }, // seg 74 - '{ 32'hBF7F0B0E, 32'h3AA91301, 32'h38EB151C }, // seg 75 - '{ 32'hBF7F95A3, 32'h3A073EF5, 32'h382D041E }, // seg 76 - '{ 
32'hBF7FD274, 32'h3956AD85, 32'h377E46D5 }, // seg 77 - '{ 32'hBF7FECAF, 32'h38A9A5DF, 32'h36BB164C }, // seg 78 - '{ 32'hBF7FF7E0, 32'h3805A163, 32'h3609E0DD }, // seg 79 - '{ 32'hBF7FFCA1, 32'h37505C2F, 32'h3549DC3A } // seg 80 -}; - diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv new file mode 100644 index 0000000000..2838f03fe2 --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv @@ -0,0 +1,395 @@ +/**************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief Per-activation configuration for pwpolyf. + * @author Shane Fleming + * + * @description + * Package consolidating shared approximation constants and per-activation + * configuration (clamping parameters + coefficient arrays) for the + * piecewise polynomial activation unit. + * + * Coefficient data auto-generated by pwpolyf_coeffs.py -- DEGREE=2 K=3 + * NUM_OCTAVES=5. Segments: 81 Coefficients per segment: 3 + * Polynomial: y = a_0 + a_1*x + a_2*x^2 + ... + a_d*x^d + * Horner form: y = a_0 + x*(a_1 + x*(a_2 + ... x*a_d)) + * + * Segment index encoding: + * 0 = near-zero (|x| < 0.25) + * 1 .. 5*2^K = positive octaves (exp 125..129) + * 5*2^K+1 .. 
end = negative octaves (exp 125..129) + ***************************************************************************/ +package pwpolyf_pkg; + + localparam int unsigned DEGREE = 2; + localparam int unsigned K = 3; + localparam int unsigned NUM_OCTAVES = 5; + localparam int unsigned NUM_SEGS = 81; + + typedef struct { + logic [31:0] neg_clamp; + logic [31:0] pos_clamp; + bit pos_passthrough; + logic [31:0] coeffs[NUM_SEGS][DEGREE+1]; + } func_cfg_t; + + localparam func_cfg_t GELU = '{ + neg_clamp: 32'h00000000, + pos_clamp: 32'h00000000, + pos_passthrough: 1, + coeffs: '{ + '{ 32'h37B92D98, 32'h3F000000, 32'h3ECA7276 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'hBA7AD2E9, 32'h3F0278B4, 32'h3EBE374D }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'hBAC20AB3, 32'h3F036C6B, 32'h3EBAD579 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'hBB0F6AA9, 32'h3F04950F, 32'h3EB720D6 }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'hBB4C32E1, 32'h3F05F66C, 32'h3EB31D68 }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'hBB8CE121, 32'h3F0793CF, 32'h3EAECF81 }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'hBBBD44F5, 32'h3F097002, 32'h3EAA3BB6 }, // [6] pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'hBBF87F2B, 32'h3F0B8D46, 32'h3EA566DF }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'hBC1FE374, 32'h3F0DED4E, 32'h3EA05607 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'hBC6196C5, 32'h3F11FAE7, 32'h3E985541 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'hBCA9FA9B, 32'h3F185408, 32'h3E8D0D86 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'hBCF402A2, 32'h3F1FBA0A, 32'h3E813805 }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'hBD28346B, 32'h3F281F7A, 32'h3E6A0426 }, // [12] pos_oct1_sub3: [0.6875, 0.7500) + '{ 32'hBD6013B5, 32'h3F316EAB, 32'h3E513158 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'hBD90E5E4, 32'h3F3B8A8B, 32'h3E384F09 }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'hBDB698C5, 32'h3F464FB9, 32'h3E1FB052 }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 
32'hBDE0DF89, 32'h3F5195CD, 32'h3E07A270 }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'hBE13A45A, 32'h3F62F993, 32'h3DCA8921 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'hBE483CB7, 32'h3F7A5DFA, 32'h3D6E8A6E }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'hBE7FAE79, 32'h3F8848C8, 32'h3CC080D0 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'hBE9AF2A1, 32'h3F9227C4, 32'hBB9670CC }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'hBEB35AF6, 32'h3F9A4E45, 32'hBCD3D62F }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'hBEC733B2, 32'h3FA06D28, 32'hBD26553D }, // [22] pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'hBED511F1, 32'h3FA466BA, 32'hBD4ACBAB }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'hBEDC2153, 32'h3FA64B63, 32'hBD5B0B08 }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'hBED97943, 32'h3FA5A9DF, 32'hBD5641AE }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'hBEC1A84E, 32'h3FA066D7, 32'hBD310ADA }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'hBE9AF232, 32'h3F98AA14, 32'hBCFF13CB }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'hBE609A95, 32'h3F90E6E9, 32'hBCA49F87 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 32'hBE145DCE, 32'h3F8A8925, 32'hBC411EC8 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'hBDB37CB4, 32'h3F8603AD, 32'hBBCFAA27 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'hBD47900C, 32'h3F8328BC, 32'hBB4DD2CD }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'hBCCC8719, 32'h3F818839, 32'hBABCBE73 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'hBC075296, 32'h3F8077C5, 32'hB9D4797E }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'hBAB5A3CC, 32'h3F801236, 32'hB869FE3F }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'hB935F6EF, 32'h3F800215, 32'hB6C346B7 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'hB78928DE, 32'h3F80002E, 32'hB4F89CA8 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'hB59C87B9, 32'h3F800003, 32'hB2F2851B }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'hB387DBFC, 32'h3F800000, 32'hB0B5DB02 
}, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'hB133EE82, 32'h3F800000, 32'hAE520F13 }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'hAEB983D5, 32'h3F800000, 32'hABBEA652 }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'hBA7AD2E9, 32'h3EFB0E98, 32'h3EBE374D }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'hBAC20AB3, 32'h3EF9272A, 32'h3EBAD579 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'hBB0F6AA9, 32'h3EF6D5E1, 32'h3EB720D6 }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'hBB4C32E1, 32'h3EF41328, 32'h3EB31D68 }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + '{ 32'hBB8CE121, 32'h3EF0D863, 32'h3EAECF81 }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'hBBBD44F5, 32'h3EED1FFC, 32'h3EAA3BB6 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'hBBF87F2B, 32'h3EE8E573, 32'h3EA566DF }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'hBC1FE374, 32'h3EE42563, 32'h3EA05607 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'hBC6196C5, 32'h3EDC0A33, 32'h3E985541 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'hBCA9FA9B, 32'h3ECF57F0, 32'h3E8D0D86 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'hBCF402A2, 32'h3EC08BED, 32'h3E813805 }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'hBD28346B, 32'h3EAFC10C, 32'h3E6A0426 }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'hBD6013B5, 32'h3E9D22AA, 32'h3E513158 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'hBD90E5E4, 32'h3E88EAEA, 32'h3E384F09 }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'hBDB698C5, 32'h3E66C11A, 32'h3E1FB052 }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'hBDE0DF89, 32'h3E39A8CD, 32'h3E07A270 }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'hBE13A45A, 32'h3DE83369, 32'h3DCA8921 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'hBE483CB7, 32'h3CB440CF, 32'h3D6E8A6E }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'hBE7FAE79, 32'hBD848C82, 32'h3CC080D0 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'hBE9AF2A1, 32'hBE113E1E, 32'hBB9670CC }, 
// [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'hBEB35AF6, 32'hBE52722A, 32'hBCD3D62F }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'hBEC733B2, 32'hBE81B4A1, 32'hBD26553D }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'hBED511F1, 32'hBE919AE6, 32'hBD4ACBAB }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'hBEDC2153, 32'hBE992D8A, 32'hBD5B0B08 }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'hBED97943, 32'hBE96A77B, 32'hBD5641AE }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'hBEC1A84E, 32'hBE819B5D, 32'hBD310ADA }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) + '{ 32'hBE9AF232, 32'hBE4550A3, 32'hBCFF13CB }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'hBE609A95, 32'hBE073747, 32'hBCA49F87 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'hBE145DCE, 32'hBDA8924A, 32'hBC411EC8 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'hBDB37CB4, 32'hBD4075A8, 32'hBBCFAA27 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'hBD47900C, 32'hBCCA2F0E, 32'hBB4DD2CD }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'hBCCC8719, 32'hBC441CB8, 32'hBABCBE73 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 32'hBC075296, 32'hBB6F8A8D, 32'hB9D4797E }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'hBAB5A3CC, 32'hBA11AE73, 32'hB869FE3F }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'hB935F6EF, 32'hB8853C9B, 32'hB6C346B7 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'hB78928DF, 32'hB6B89CD1, 32'hB4F89CA9 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'hB59C87B7, 32'hB4C2CE38, 32'hB2F28516 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'hB387DC68, 32'hB29D2B70, 32'hB0B5DB95 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'hB1340659, 32'hB0427F9C, 32'hAE52296C }, // [79] neg_oct4_sub6: [-7.5000, -7.0000) + '{ 32'hAEB6A669, 32'hADB910C4, 32'hABBB86A1 } // [80] neg_oct4_sub7: [-8.0000, -7.5000) + } + }; + + localparam func_cfg_t SILU = '{ + neg_clamp: 32'h00000000, + pos_clamp: 32'h00000000, + pos_passthrough: 1, + coeffs: '{ + '{ 
32'h36E8E4D8, 32'h3F000000, 32'h3E7EDCA9 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'hB99F0E43, 32'h3F00C85E, 32'h3E771F38 }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'hB9F6D43F, 32'h3F01164A, 32'h3E74F597 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'hBA370988, 32'h3F017596, 32'h3E729418 }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'hBA82C874, 32'h3F01E7B8, 32'h3E6FFC71 }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'hBAB52EA2, 32'h3F026E06, 32'h3E6D3079 }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'hBAF47A83, 32'h3F0309BD, 32'h3E6A3227 }, // [6] pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'hBB213FE4, 32'h3F03BBFC, 32'h3E670392 }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'hBB508F23, 32'h3F0485C2, 32'h3E63A6E9 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'hBB945DC5, 32'h3F05E1DA, 32'h3E5E4861 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'hBBE25EA1, 32'h3F080BF4, 32'h3E569783 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'hBC24C1FC, 32'h3F0A9F8F, 32'h3E4E59C7 }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'hBC66B247, 32'h3F0D9E5F, 32'h3E45A3CF }, // [12] pos_oct1_sub3: [0.6875, 0.7500) + '{ 32'hBC9C5235, 32'h3F11080C, 32'h3E3C8A91 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'hBCCE03BB, 32'h3F14DA46, 32'h3E332307 }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'hBD048260, 32'h3F1910E8, 32'h3E2981D7 }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 32'hBD26E393, 32'h3F1DA629, 32'h3E1FBB0A }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'hBD637E84, 32'h3F252120, 32'h3E10F470 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'hBDA33C0B, 32'h3F301FB8, 32'h3DFACFC9 }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'hBDDE6AD0, 32'h3F3BF5DC, 32'h3DD4EC26 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'hBE112138, 32'h3F484C26, 32'h3DB1044F }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'hBE369C04, 32'h3F54CB60, 32'h3D8FAC7F }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'hBE5EB18B, 32'h3F612208, 32'h3D629168 }, // [22] 
pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'hBE842940, 32'h3F6D0854, 32'h3D2C2276 }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'hBE9933D8, 32'h3F7842BB, 32'h3CF863D2 }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'hBEB7B0C1, 32'h3F83AB39, 32'h3C8110DA }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'hBEDD223E, 32'h3F8C0325, 32'h3AA08170 }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'hBEFBE17D, 32'h3F922E48, 32'hBC0A6454 }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'hBF093230, 32'h3F9649E6, 32'hBC6A5E21 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 32'hBF101F60, 32'h3F989BC7, 32'hBC8E0D50 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'hBF12EF0A, 32'h3F997B2D, 32'hBC96B8B5 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'hBF121E34, 32'h3F9940CD, 32'hBC94AEB1 }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'hBF0E4E33, 32'h3F983D51, 32'hBC8C0ED6 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'hBF04791A, 32'h3F95D0ED, 32'hBC71E16F }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'hBEE59935, 32'h3F91E3AD, 32'hBC3A058C }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'hBEBEB440, 32'h3F8DFEE7, 32'hBC081CC1 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'hBE9948D3, 32'h3F8A96AC, 32'hBBC0C8D1 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'hBE70101E, 32'h3F87D004, 32'hBB8571D8 }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'hBE3805CC, 32'h3F85A739, 32'hBB35ABD2 }, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'hBE0A951B, 32'h3F840701, 32'hBAF42F39 }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'hBDCD9ACB, 32'h3F82D521, 32'hBAA27541 }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'hB99F0E43, 32'h3EFE6F44, 32'h3E771F38 }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'hB9F6D43F, 32'h3EFDD36C, 32'h3E74F597 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'hBA370988, 32'h3EFD14D3, 32'h3E729418 }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'hBA82C874, 32'h3EFC3091, 32'h3E6FFC71 }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + 
'{ 32'hBAB52EA2, 32'h3EFB23F5, 32'h3E6D3079 }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'hBAF47A83, 32'h3EF9EC85, 32'h3E6A3227 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'hBB213FE4, 32'h3EF88807, 32'h3E670392 }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'hBB508F23, 32'h3EF6F47D, 32'h3E63A6E9 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'hBB945DC5, 32'h3EF43C4C, 32'h3E5E4861 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'hBBE25EA1, 32'h3EEFE817, 32'h3E569783 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'hBC24C1FC, 32'h3EEAC0E2, 32'h3E4E59C7 }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'hBC66B247, 32'h3EE4C342, 32'h3E45A3CF }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'hBC9C5235, 32'h3EDDEFE8, 32'h3E3C8A91 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'hBCCE03BB, 32'h3ED64B75, 32'h3E332307 }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'hBD048260, 32'h3ECDDE30, 32'h3E2981D7 }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'hBD26E393, 32'h3EC4B3AE, 32'h3E1FBB0A }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'hBD637E84, 32'h3EB5BDC0, 32'h3E10F470 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'hBDA33C0B, 32'h3E9FC090, 32'h3DFACFC9 }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'hBDDE6AD0, 32'h3E881448, 32'h3DD4EC26 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'hBE112138, 32'h3E5ECF69, 32'h3DB1044F }, // [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'hBE369C04, 32'h3E2CD280, 32'h3D8FAC7F }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'hBE5EB18B, 32'h3DF6EFBF, 32'h3D629168 }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'hBE842940, 32'h3D97BD5C, 32'h3D2C2276 }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'hBE9933D8, 32'h3CF7A895, 32'h3CF863D2 }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'hBEB7B0C1, 32'hBCEACE5E, 32'h3C8110DA }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'hBEDD223E, 32'hBDC03253, 32'h3AA08170 }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) 
+ '{ 32'hBEFBE17D, 32'hBE11723E, 32'hBC0A6454 }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'hBF093230, 32'hBE324F32, 32'hBC6A5E21 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'hBF101F60, 32'hBE44DE3B, 32'hBC8E0D50 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'hBF12EF0A, 32'hBE4BD96A, 32'hBC96B8B5 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'hBF121E34, 32'hBE4A0667, 32'hBC94AEB1 }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'hBF0E4E33, 32'hBE41EA8C, 32'hBC8C0ED6 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 32'hBF04791A, 32'hBE2E8765, 32'hBC71E16F }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'hBEE59935, 32'hBE0F1D69, 32'hBC3A058C }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'hBEBEB440, 32'hBDDFEE68, 32'hBC081CC1 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'hBE9948D3, 32'hBDA96ABE, 32'hBBC0C8D1 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'hBE70101E, 32'hBD7A008A, 32'hBB8571D8 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'hBE3805CC, 32'hBD34E717, 32'hBB35ABD2 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'hBE0A951B, 32'hBD00E023, 32'hBAF42F39 }, // [79] neg_oct4_sub6: [-7.5000, -7.0000) + '{ 32'hBDCD9ACB, 32'hBCB54833, 32'hBAA27541 } // [80] neg_oct4_sub7: [-8.0000, -7.5000) + } + }; + + localparam func_cfg_t SIGMOID = '{ + neg_clamp: 32'h00000000, + pos_clamp: 32'h3F800000, + pos_passthrough: 0, + coeffs: '{ + '{ 32'h3F000000, 32'h3E7F33E9, 32'h00000000 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'h3EFFCF0E, 32'h3E822D89, 32'hBC84D823 }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'h3EFFBC5D, 32'h3E82B26C, 32'hBC939CA5 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'h3EFFA5B9, 32'h3E834349, 32'hBCA219C6 }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'h3EFF8AE8, 32'h3E83DF4C, 32'hBCB04918 }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'h3EFF6BBB, 32'h3E84858E, 32'hBCBE247C }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'h3EFF4812, 32'h3E85351C, 32'hBCCBA623 }, // [6] 
pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'h3EFF1FDA, 32'h3E85ECF7, 32'hBCD8C899 }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'h3EFEF30F, 32'h3E86AC14, 32'hBCE586C5 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'h3EFEA7FD, 32'h3E87D4B2, 32'hBCF7D7C9 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'h3EFE340F, 32'h3E897100, 32'hBD0761C6 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'h3EFDB07F, 32'h3E8B1626, 32'hBD11EAFB }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'h3EFD1FEF, 32'h3E8CBAE0, 32'hBD1B7C9A }, // [12] pos_oct1_sub3: [0.6875, 0.7500) + '{ 32'h3EFC85D6, 32'h3E8E5602, 32'hBD240EF3 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'h3EFBE66B, 32'h3E8FDEA9, 32'hBD2B9D89 }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'h3EFB4684, 32'h3E914C67, 32'hBD3226EB }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 32'h3EFAAB7A, 32'h3E929769, 32'hBD37AC88 }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'h3EF9DB97, 32'h3E9432F4, 32'hBD3E0A59 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'h3EF90A04, 32'h3E95A99D, 32'hBD434674 }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'h3EF8B5B0, 32'h3E963283, 32'hBD4502F9 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'h3EF90A01, 32'h3E95B9A5, 32'hBD43A873 }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'h3EFA2B30, 32'h3E94399D, 32'hBD3FAC42 }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'h3EFC3409, 32'h3E91B9DE, 32'hBD39885F }, // [22] pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'h3EFF34CB, 32'h3E8E4C54, 32'hBD31B499 }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'h3F019987, 32'h3E8A0AB5, 32'hBD28A16E }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'h3F057A9E, 32'h3E8262A4, 32'hBD1983F9 }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'h3F0C3068, 32'h3E6CECEE, 32'hBD0452A4 }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'h3F1447FB, 32'h3E530738, 32'hBCDF3123 }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'h3F1D4A57, 32'h3E38CEB4, 32'hBCB90648 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 
32'h3F26C116, 32'h3E1F8DBB, 32'hBC975293 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'h3F30448D, 32'h3E081E0B, 32'hBC74E633 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'h3F398293, 32'h3DE5F2D2, 32'hBC4485E9 }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'h3F423FDE, 32'h3DC0A0B4, 32'hBC1CAC71 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'h3F4DF46F, 32'h3D924A89, 32'hBBDD9C65 }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'h3F5B2AC0, 32'h3D464E5F, 32'hBB897CEB }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'h3F656FD2, 32'h3D045D55, 32'hBB291712 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'h3F6D25B4, 32'h3CAEBB3F, 32'hBACED637 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'h3F72CA7A, 32'h3C64B7EA, 32'hBA7C2E52 }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'h3F76D7C3, 32'h3C14B63C, 32'hBA196D41 }, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'h3F79B550, 32'h3BC05DAC, 32'hB9BA76BE }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'h3F7BB5C3, 32'h3B77C0C9, 32'hB96272C7 }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'h3F001879, 32'h3E822D89, 32'h3C84D823 }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'h3F0021D2, 32'h3E82B26C, 32'h3C939CA5 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'h3F002D23, 32'h3E834349, 32'h3CA219C6 }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'h3F003A8C, 32'h3E83DF4C, 32'h3CB04918 }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + '{ 32'h3F004A22, 32'h3E84858E, 32'h3CBE247C }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'h3F005BF7, 32'h3E85351C, 32'h3CCBA623 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'h3F007013, 32'h3E85ECF7, 32'h3CD8C899 }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'h3F008678, 32'h3E86AC14, 32'h3CE586C5 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'h3F00AC01, 32'h3E87D4B2, 32'h3CF7D7C9 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'h3F00E5F9, 32'h3E897100, 32'h3D0761C6 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'h3F0127C1, 
32'h3E8B1626, 32'h3D11EAFB }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'h3F017009, 32'h3E8CBAE0, 32'h3D1B7C9A }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'h3F01BD15, 32'h3E8E5602, 32'h3D240EF3 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'h3F020CCB, 32'h3E8FDEA9, 32'h3D2B9D89 }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'h3F025CBE, 32'h3E914C67, 32'h3D3226EB }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'h3F02AA43, 32'h3E929769, 32'h3D37AC88 }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'h3F031234, 32'h3E9432F4, 32'h3D3E0A59 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'h3F037AFE, 32'h3E95A99D, 32'h3D434674 }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'h3F03A528, 32'h3E963283, 32'h3D4502F9 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'h3F037AFF, 32'h3E95B9A5, 32'h3D43A873 }, // [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'h3F02EA68, 32'h3E94399D, 32'h3D3FAC42 }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'h3F01E5FC, 32'h3E91B9DE, 32'h3D39885F }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'h3F00659B, 32'h3E8E4C54, 32'h3D31B499 }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'h3EFCCCF2, 32'h3E8A0AB5, 32'h3D28A16E }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'h3EF50AC5, 32'h3E8262A4, 32'h3D1983F9 }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'h3EE79F30, 32'h3E6CECEE, 32'h3D0452A4 }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) + '{ 32'h3ED7700A, 32'h3E530738, 32'h3CDF3123 }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'h3EC56B51, 32'h3E38CEB4, 32'h3CB90648 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'h3EB27DD3, 32'h3E1F8DBB, 32'h3C975293 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'h3E9F76E6, 32'h3E081E0B, 32'h3C74E633 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'h3E8CFAD9, 32'h3DE5F2D2, 32'h3C4485E9 }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'h3E770088, 32'h3DC0A0B4, 32'h3C1CAC71 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 
32'h3E482E45, 32'h3D924A89, 32'h3BDD9C65 }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'h3E1354FF, 32'h3D464E5F, 32'h3B897CEB }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'h3DD48171, 32'h3D045D55, 32'h3B291712 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'h3D96D261, 32'h3CAEBB3F, 32'h3ACED637 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'h3D535864, 32'h3C64B7EA, 32'h3A7C2E52 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'h3D1283CA, 32'h3C14B63C, 32'h3A196D41 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'h3CC95606, 32'h3BC05DAC, 32'h39BA76BE }, // [79] neg_oct4_sub6: [-7.5000, -7.0000) + '{ 32'h3C894795, 32'h3B77C0C9, 32'h396272C7 } // [80] neg_oct4_sub7: [-8.0000, -7.5000) + } + }; + + localparam func_cfg_t TANH = '{ + neg_clamp: 32'hBF800000, + pos_clamp: 32'h3F800000, + pos_passthrough: 0, + coeffs: '{ + '{ 32'hA1B504F3, 32'h3F7CDA60, 32'h00000000 }, // [0] near_zero: [-0.2500, 0.2500) + '{ 32'hBBAC0178, 32'h3F87D4B2, 32'hBE77D7C9 }, // [1] pos_oct0_sub0: [0.2500, 0.2812) + '{ 32'hBBE5F89C, 32'h3F897100, 32'hBE8761C6 }, // [2] pos_oct0_sub1: [0.2812, 0.3125) + '{ 32'hBC13E055, 32'h3F8B1626, 32'hBE91EAFB }, // [3] pos_oct0_sub2: [0.3125, 0.3438) + '{ 32'hBC38044C, 32'h3F8CBAE0, 32'hBE9B7C9A }, // [4] pos_oct0_sub3: [0.3438, 0.3750) + '{ 32'hBC5E8A68, 32'h3F8E5602, 32'hBEA40EF3 }, // [5] pos_oct0_sub4: [0.3750, 0.4062) + '{ 32'hBC8332A5, 32'h3F8FDEA9, 32'hBEAB9D89 }, // [6] pos_oct0_sub5: [0.4062, 0.4375) + '{ 32'hBC972F8C, 32'h3F914C67, 32'hBEB226EB }, // [7] pos_oct0_sub6: [0.4375, 0.4688) + '{ 32'hBCAA90CC, 32'h3F929769, 32'hBEB7AC88 }, // [8] pos_oct0_sub7: [0.4688, 0.5000) + '{ 32'hBCC48D18, 32'h3F9432F4, 32'hBEBE0A59 }, // [9] pos_oct1_sub0: [0.5000, 0.5625) + '{ 32'hBCDEBF75, 32'h3F95A99D, 32'hBEC34674 }, // [10] pos_oct1_sub1: [0.5625, 0.6250) + '{ 32'hBCE94A05, 32'h3F963283, 32'hBEC502F9 }, // [11] pos_oct1_sub2: [0.6250, 0.6875) + '{ 32'hBCDEBFD9, 32'h3F95B9A5, 32'hBEC3A873 }, // [12] pos_oct1_sub3: [0.6875, 0.7500) 
+ '{ 32'hBCBA99F8, 32'h3F94399D, 32'hBEBFAC42 }, // [13] pos_oct1_sub4: [0.7500, 0.8125) + '{ 32'hBC72FDCF, 32'h3F91B9DE, 32'hBEB9885F }, // [14] pos_oct1_sub5: [0.8125, 0.8750) + '{ 32'hBB4B3567, 32'h3F8E4C54, 32'hBEB1B499 }, // [15] pos_oct1_sub6: [0.8750, 0.9375) + '{ 32'h3C4CC375, 32'h3F8A0AB5, 32'hBEA8A16E }, // [16] pos_oct1_sub7: [0.9375, 1.0000) + '{ 32'h3D2F53B5, 32'h3F8262A4, 32'hBE9983F9 }, // [17] pos_oct2_sub0: [1.0000, 1.1250) + '{ 32'h3DC3067D, 32'h3F6CECEE, 32'hBE8452A4 }, // [18] pos_oct2_sub1: [1.1250, 1.2500) + '{ 32'h3E223FD6, 32'h3F530738, 32'hBE5F3123 }, // [19] pos_oct2_sub2: [1.2500, 1.3750) + '{ 32'h3E6A52BB, 32'h3F38CEB4, 32'hBE390648 }, // [20] pos_oct2_sub3: [1.3750, 1.5000) + '{ 32'h3E9B0459, 32'h3F1F8DBB, 32'hBE175293 }, // [21] pos_oct2_sub4: [1.5000, 1.6250) + '{ 32'h3EC11234, 32'h3F081E0B, 32'hBDF4E633 }, // [22] pos_oct2_sub5: [1.6250, 1.7500) + '{ 32'h3EE60A4E, 32'h3EE5F2D2, 32'hBDC485E9 }, // [23] pos_oct2_sub6: [1.7500, 1.8750) + '{ 32'h3F047FBC, 32'h3EC0A0B4, 32'hBD9CAC71 }, // [24] pos_oct2_sub7: [1.8750, 2.0000) + '{ 32'h3F1BE8DE, 32'h3E924A89, 32'hBD5D9C65 }, // [25] pos_oct3_sub0: [2.0000, 2.2500) + '{ 32'h3F365580, 32'h3E464E5F, 32'hBD097CEB }, // [26] pos_oct3_sub1: [2.2500, 2.5000) + '{ 32'h3F4ADFA4, 32'h3E045D55, 32'hBCA91712 }, // [27] pos_oct3_sub2: [2.5000, 2.7500) + '{ 32'h3F5A4B68, 32'h3DAEBB3F, 32'hBC4ED637 }, // [28] pos_oct3_sub3: [2.7500, 3.0000) + '{ 32'h3F6594F3, 32'h3D64B7EA, 32'hBBFC2E52 }, // [29] pos_oct3_sub4: [3.0000, 3.2500) + '{ 32'h3F6DAF87, 32'h3D14B63C, 32'hBB996D41 }, // [30] pos_oct3_sub5: [3.2500, 3.5000) + '{ 32'h3F736AA0, 32'h3CC05DAC, 32'hBB3A76BE }, // [31] pos_oct3_sub6: [3.5000, 3.7500) + '{ 32'h3F776B87, 32'h3C77C0C9, 32'hBAE272C7 }, // [32] pos_oct3_sub7: [3.7500, 4.0000) + '{ 32'h3F7B291E, 32'h3C00F319, 32'hBA590184 }, // [33] pos_oct4_sub0: [4.0000, 4.5000) + '{ 32'h3F7DD41A, 32'h3B51D16D, 32'hB99FC02C }, // [34] pos_oct4_sub1: [4.5000, 5.0000) + '{ 32'h3F7F0B06, 32'h3AA9199D, 
32'hB8EB1F74 }, // [35] pos_oct4_sub2: [5.0000, 5.5000) + '{ 32'h3F7F95A4, 32'h3A073D0F, 32'hB82D01A6 }, // [36] pos_oct4_sub3: [5.5000, 6.0000) + '{ 32'h3F7FD268, 32'h3956EC40, 32'hB77E96C4 }, // [37] pos_oct4_sub4: [6.0000, 6.5000) + '{ 32'h3F7FECAA, 32'h38A9D71D, 32'hB6BB5164 }, // [38] pos_oct4_sub5: [6.5000, 7.0000) + '{ 32'h3F7FF7E0, 32'h38059366, 32'hB609D243 }, // [39] pos_oct4_sub6: [7.0000, 7.5000) + '{ 32'h3F7FFC9E, 32'h37513C1C, 32'hB54ACE8B }, // [40] pos_oct4_sub7: [7.5000, 8.0000) + '{ 32'h3BAC0178, 32'h3F87D4B2, 32'h3E77D7C9 }, // [41] neg_oct0_sub0: [-0.2812, -0.2500) + '{ 32'h3BE5F89C, 32'h3F897100, 32'h3E8761C6 }, // [42] neg_oct0_sub1: [-0.3125, -0.2812) + '{ 32'h3C13E055, 32'h3F8B1626, 32'h3E91EAFB }, // [43] neg_oct0_sub2: [-0.3438, -0.3125) + '{ 32'h3C38044C, 32'h3F8CBAE0, 32'h3E9B7C9A }, // [44] neg_oct0_sub3: [-0.3750, -0.3438) + '{ 32'h3C5E8A68, 32'h3F8E5602, 32'h3EA40EF3 }, // [45] neg_oct0_sub4: [-0.4062, -0.3750) + '{ 32'h3C8332A5, 32'h3F8FDEA9, 32'h3EAB9D89 }, // [46] neg_oct0_sub5: [-0.4375, -0.4062) + '{ 32'h3C972F8C, 32'h3F914C67, 32'h3EB226EB }, // [47] neg_oct0_sub6: [-0.4688, -0.4375) + '{ 32'h3CAA90CC, 32'h3F929769, 32'h3EB7AC88 }, // [48] neg_oct0_sub7: [-0.5000, -0.4688) + '{ 32'h3CC48D18, 32'h3F9432F4, 32'h3EBE0A59 }, // [49] neg_oct1_sub0: [-0.5625, -0.5000) + '{ 32'h3CDEBF75, 32'h3F95A99D, 32'h3EC34674 }, // [50] neg_oct1_sub1: [-0.6250, -0.5625) + '{ 32'h3CE94A05, 32'h3F963283, 32'h3EC502F9 }, // [51] neg_oct1_sub2: [-0.6875, -0.6250) + '{ 32'h3CDEBFD9, 32'h3F95B9A5, 32'h3EC3A873 }, // [52] neg_oct1_sub3: [-0.7500, -0.6875) + '{ 32'h3CBA99F8, 32'h3F94399D, 32'h3EBFAC42 }, // [53] neg_oct1_sub4: [-0.8125, -0.7500) + '{ 32'h3C72FDCF, 32'h3F91B9DE, 32'h3EB9885F }, // [54] neg_oct1_sub5: [-0.8750, -0.8125) + '{ 32'h3B4B3567, 32'h3F8E4C54, 32'h3EB1B499 }, // [55] neg_oct1_sub6: [-0.9375, -0.8750) + '{ 32'hBC4CC375, 32'h3F8A0AB5, 32'h3EA8A16E }, // [56] neg_oct1_sub7: [-1.0000, -0.9375) + '{ 32'hBD2F53B5, 32'h3F8262A4, 
32'h3E9983F9 }, // [57] neg_oct2_sub0: [-1.1250, -1.0000) + '{ 32'hBDC3067D, 32'h3F6CECEE, 32'h3E8452A4 }, // [58] neg_oct2_sub1: [-1.2500, -1.1250) + '{ 32'hBE223FD6, 32'h3F530738, 32'h3E5F3123 }, // [59] neg_oct2_sub2: [-1.3750, -1.2500) + '{ 32'hBE6A52BB, 32'h3F38CEB4, 32'h3E390648 }, // [60] neg_oct2_sub3: [-1.5000, -1.3750) + '{ 32'hBE9B0459, 32'h3F1F8DBB, 32'h3E175293 }, // [61] neg_oct2_sub4: [-1.6250, -1.5000) + '{ 32'hBEC11234, 32'h3F081E0B, 32'h3DF4E633 }, // [62] neg_oct2_sub5: [-1.7500, -1.6250) + '{ 32'hBEE60A4E, 32'h3EE5F2D2, 32'h3DC485E9 }, // [63] neg_oct2_sub6: [-1.8750, -1.7500) + '{ 32'hBF047FBC, 32'h3EC0A0B4, 32'h3D9CAC71 }, // [64] neg_oct2_sub7: [-2.0000, -1.8750) + '{ 32'hBF1BE8DE, 32'h3E924A89, 32'h3D5D9C65 }, // [65] neg_oct3_sub0: [-2.2500, -2.0000) + '{ 32'hBF365580, 32'h3E464E5F, 32'h3D097CEB }, // [66] neg_oct3_sub1: [-2.5000, -2.2500) + '{ 32'hBF4ADFA4, 32'h3E045D55, 32'h3CA91712 }, // [67] neg_oct3_sub2: [-2.7500, -2.5000) + '{ 32'hBF5A4B68, 32'h3DAEBB3F, 32'h3C4ED637 }, // [68] neg_oct3_sub3: [-3.0000, -2.7500) + '{ 32'hBF6594F3, 32'h3D64B7EA, 32'h3BFC2E52 }, // [69] neg_oct3_sub4: [-3.2500, -3.0000) + '{ 32'hBF6DAF87, 32'h3D14B63C, 32'h3B996D41 }, // [70] neg_oct3_sub5: [-3.5000, -3.2500) + '{ 32'hBF736AA0, 32'h3CC05DAC, 32'h3B3A76BE }, // [71] neg_oct3_sub6: [-3.7500, -3.5000) + '{ 32'hBF776B87, 32'h3C77C0C9, 32'h3AE272C7 }, // [72] neg_oct3_sub7: [-4.0000, -3.7500) + '{ 32'hBF7B291E, 32'h3C00F319, 32'h3A590184 }, // [73] neg_oct4_sub0: [-4.5000, -4.0000) + '{ 32'hBF7DD41A, 32'h3B51D16D, 32'h399FC02C }, // [74] neg_oct4_sub1: [-5.0000, -4.5000) + '{ 32'hBF7F0B06, 32'h3AA9199D, 32'h38EB1F74 }, // [75] neg_oct4_sub2: [-5.5000, -5.0000) + '{ 32'hBF7F95A4, 32'h3A073D0F, 32'h382D01A6 }, // [76] neg_oct4_sub3: [-6.0000, -5.5000) + '{ 32'hBF7FD268, 32'h3956EC40, 32'h377E96C4 }, // [77] neg_oct4_sub4: [-6.5000, -6.0000) + '{ 32'hBF7FECAA, 32'h38A9D71D, 32'h36BB5164 }, // [78] neg_oct4_sub5: [-7.0000, -6.5000) + '{ 32'hBF7FF7E0, 
32'h38059366, 32'h3609D243 }, // [79] neg_oct4_sub6: [-7.5000, -7.0000) + '{ 32'hBF7FFC9E, 32'h37513C1C, 32'h354ACE8B } // [80] neg_oct4_sub7: [-8.0000, -7.5000) + } + }; + +endpackage \ No newline at end of file diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv new file mode 100644 index 0000000000..574b073b3e --- /dev/null +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv @@ -0,0 +1,145 @@ +/**************************************************************************** + * Copyright (C) 2026, Advanced Micro Devices, Inc. + * All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + * + * @brief Testbench for pwpolyf: FP32 piecewise polynomial activation. + * @author Shane Fleming + * + * @description + * Tests all four activation functions (gelu, silu, sigmoid, tanh) in + * parallel using random FP32 stimulus with online shortreal-based + * checking against a reference function. + ***************************************************************************/ + +module pwpolyf_tb; + + localparam int unsigned TEST_COUNT = 4; + localparam string FUNCS[TEST_COUNT] = '{"gelu", "silu", "sigmoid", "tanh"}; + localparam int unsigned RUNS = 4096; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + repeat(12) @(posedge clk); + rst <= 0; + end + + bit [TEST_COUNT-1:0] done = '0; + always_comb begin + if(&done) $finish; + end + + for(genvar t = 0; t < TEST_COUNT; t++) begin : genTests + localparam string FUNC = FUNCS[t]; + + // DUT wired for PE=1 + logic [31:0] xdat; + logic xvld; + uwire xrdy; + uwire [31:0] ydat; + uwire yvld; + logic yrdy; + + pwpolyf #(.PE(1), .FUNC(FUNC)) dut ( + .clk, .rst, + .xdat, .xvld, .xrdy, + .ydat, .yvld, .yrdy + ); + shortreal y; + assign y = $bitstoshortreal(ydat); + + // Reference function -- compute in real, cast to shortreal + function automatic shortreal ref_func(input shortreal x); + automatic real xr = real'(x); + automatic real yr; + if(xr >= 8.0) 
+ return (FUNC == "gelu" || FUNC == "silu")? x : shortreal'(1.0); + if(xr <= -8.0) + return (FUNC == "tanh")? shortreal'(-1.0) : shortreal'(0.0); + if(FUNC == "gelu") begin + automatic real t = $tanh($sqrt(2.0/3.14159265358979) * (xr + 0.044715*xr*xr*xr)); + yr = 0.5 * xr * (1.0 + t); + end + else if(FUNC == "silu") yr = xr / (1.0 + $exp(-xr)); + else if(FUNC == "sigmoid") yr = 1.0 / (1.0 + $exp(-xr)); + else yr = $tanh(xr); + return shortreal'(yr); + endfunction + + // Online checking state + shortreal ExpQ[$]; + + // Stimulus driver + initial begin + xdat = '0; + xvld = 0; + @(posedge clk iff !rst); + + repeat(RUNS) begin + automatic logic [31:0] vbits; + + // Base draw covers |x| in [2.0, 16.0) (exp 128..130); below 2.0 only the 1.0 and 0.0 overrides are exercised + vbits = 32'h40000000 + ($urandom() % 32'h01800000); // [2.0, 16.0) range + if($urandom() % 2) vbits[31] = 1; // random sign + if($urandom() % 4 == 0) vbits = 32'h3F800000; // 1.0 + if($urandom() % 8 == 0) vbits = 32'h00000000; // 0.0 + if($urandom() % 8 == 0) vbits = 32'h40E00000 | ($urandom() % 32'h00100000); // [7.0, 7.5) + + while($urandom() % 17 == 0) @(posedge clk); + + xdat <= vbits; + xvld <= 1; + + @(posedge clk iff xrdy); + ExpQ.push_back(ref_func($bitstoshortreal(vbits))); + + xvld <= 0; + end + end + + always_ff @(posedge clk iff yvld && yrdy) begin + automatic shortreal exp, err; + assert(ExpQ.size) else begin + $error("[%s] Spurious output.", FUNC); + $stop; + end + exp = ExpQ.pop_front(); + err = y - exp; + err *= err; + assert((err < 1e-3) || ($shortrealtobits(y) == $shortrealtobits(exp))) else begin + $error("[%s] Output mismatch: %f/%08x instead of %f/%08x", + FUNC, y, $shortrealtobits(y), exp, $shortrealtobits(exp)); + $stop; + end + end + + // Output collector -- drives yrdy backpressure + initial begin + yrdy = 0; + @(posedge clk iff !rst); + + repeat(RUNS) begin + while($urandom() % 17 == 0) @(posedge clk); + yrdy <= 1; + @(posedge clk iff yvld); + yrdy <= 0; + end + + // Verify all expected outputs were consumed + @(posedge clk);
+ assert(ExpQ.size() == 0) else begin + $error("[%s] Missing %0d outputs.", FUNC, ExpQ.size()); + $stop; + end + + $display("PWPOLYF[%s]: %0d outputs verified online.", FUNC, RUNS); + done[t] = 1; + end + + end : genTests + +endmodule : pwpolyf_tb \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py index d4736f7fee..dccc9e8240 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py @@ -43,6 +43,7 @@ rtlsim_output_to_npy, ) from finn.util.pwpolyf import ( + CLAMP_CFG, NUM_OCTAVES, SUPPORTED_FUNCS, _fit_coefficients, @@ -59,36 +60,64 @@ def _float_to_hex(f): return "%08X" % struct.unpack("!I", struct.pack("!f", float(f)))[0] -def generate_coeffs_svh(K, num_samples=1000): - """Generate the pwpolyf_coeffs.svh file content for a given K value.""" +def generate_coeffs_pkg(K, num_samples=1000): + """Generate the pwpolyf_pkg.sv package content for a given K value. + + Produces a SystemVerilog package with a func_cfg_t struct per activation + function, containing clamping parameters and polynomial coefficients. 
+ """ + degree = 2 num_subs = 1 << K num_segs = 1 + 2 * NUM_OCTAVES * num_subs lines = [] lines.append("// Auto-generated by pwpolyf_rtl.py — do not edit manually.") - lines.append("// K=%d, NUM_SEGS=%d, NUM_OCTAVES=%d, DEGREE=2" % (K, num_segs, NUM_OCTAVES)) + lines.append( + "// DEGREE=%d K=%d NUM_OCTAVES=%d Segments: %d" + % (degree, K, NUM_OCTAVES, num_segs) + ) + lines.append("") + lines.append("package pwpolyf_pkg;") + lines.append("") + lines.append(" localparam int unsigned DEGREE = %d;" % degree) + lines.append(" localparam int unsigned K = %d;" % K) + lines.append(" localparam int unsigned NUM_OCTAVES = %d;" % NUM_OCTAVES) + lines.append(" localparam int unsigned NUM_SEGS = %d;" % num_segs) lines.append("") - lines.append("localparam int unsigned PWPOLYF_K = %d;" % K) - lines.append("localparam int unsigned PWPOLYF_NUM_SEGS = %d;" % num_segs) - lines.append("localparam int unsigned PWPOLYF_NUM_OCTAVES = %d;" % NUM_OCTAVES) - lines.append("localparam int unsigned PWPOLYF_DEGREE = 2;") + lines.append(" typedef struct {") + lines.append(" logic [31:0] neg_clamp;") + lines.append(" logic [31:0] pos_clamp;") + lines.append(" bit pos_passthrough;") + lines.append(" logic [31:0] coeffs[NUM_SEGS][DEGREE+1];") + lines.append(" } func_cfg_t;") for func_name in SUPPORTED_FUNCS: + cfg = CLAMP_CFG[func_name] coeffs = _fit_coefficients(func_name, K, num_samples) - label = "PWPOLYF_%s_COEFFS" % func_name.upper() + label = func_name.upper() + neg_hex = _float_to_hex(cfg["neg_clamp"]) + pos_hex = _float_to_hex(cfg["pos_clamp"]) + passthrough = 1 if cfg["pos_passthrough"] else 0 + lines.append("") - lines.append("localparam logic [31:0] %s[%d][3] = '{" % (label, num_segs)) + lines.append(" localparam func_cfg_t %s = '{" % label) + lines.append(" neg_clamp: 32'h%s," % neg_hex) + lines.append(" pos_clamp: 32'h%s," % pos_hex) + lines.append(" pos_passthrough: %d," % passthrough) + lines.append(" coeffs: '{") for seg in range(num_segs): - c0 = _float_to_hex(coeffs[seg, 0]) - 
c1 = _float_to_hex(coeffs[seg, 1]) - c2 = _float_to_hex(coeffs[seg, 2]) + coeff_strs = [] + for c in range(degree + 1): + coeff_strs.append("32'h%s" % _float_to_hex(coeffs[seg, c])) comma = "," if seg < num_segs - 1 else "" lines.append( - " '{ 32'h%s, 32'h%s, 32'h%s }%s // seg %d" - % (c0, c1, c2, comma, seg) + " '{ %s }%s\t// seg %d" % (", ".join(coeff_strs), comma, seg) ) - lines.append("};") + lines.append(" }") + lines.append(" };") + lines.append("") + lines.append("endpackage") lines.append("") return "\n".join(lines) @@ -125,8 +154,8 @@ def prepare_codegen_rtl_values(self, model): def get_rtl_file_list(self): return [ + "pwpolyf_pkg.sv", "pwpolyf.sv", - "pwpolyf_coeffs.svh", "queue.sv", "pwpolyf_template_wrapper.v", ] @@ -166,11 +195,11 @@ def generate_hdl(self, model, fpgapart, clk): file_only_path = rtl_file_path.split("/")[-1] self.dump_rtl_data(code_gen_dir, file_only_path, data) - # generate coefficients .svh matching the node's K value + # generate package with coefficients matching the node's K value K = self.get_nodeattr("K") - svh_data = generate_coeffs_svh(K) - with open(os.path.join(code_gen_dir, "pwpolyf_coeffs.svh"), "w") as f: - f.write(svh_data) + pkg_data = generate_coeffs_pkg(K) + with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "w") as f: + f.write(pkg_data) self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) @@ -181,11 +210,9 @@ def prepare_rtlsim(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") verilog_paths = [code_gen_dir] - # exclude .svh — it is pulled in via `include from pwpolyf.sv verilog_files = [ x.replace("pwpolyf_template_wrapper", self.get_nodeattr("gen_top_module")) for x in self.get_rtl_file_list() - if not x.endswith(".svh") ] single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 2427a4514a..c97267bb54 
100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2026, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -275,8 +275,7 @@ def apply(self, model): class InferPWPolyFLayer(Transformation): - """Convert PWPolyF custom ops and standard ONNX activations (Gelu, Sigmoid, - Tanh, SiLU pattern) into piecewise polynomial HW layers.""" + """Convert activations to piecewise polynomial HW layers.""" _SINGLE_OP_MAP = {"Gelu": "gelu", "Tanh": "tanh"} @@ -292,18 +291,12 @@ def _is_const_scalar(model, tensor_name, value, tol=1e-3): return init.size == 1 and abs(float(init.flat[0]) - value) < tol def _match_erf_gelu(self, model, erf_node): - """Try to match the Erf-based GELU decomposition rooted at *erf_node*. - - Pattern (opset < 20): - Div(x, sqrt(2)) → Erf → Add(_, 1) → Mul(0.5, _) → Mul(x, _) - - Returns (pwp_input, pwp_output, nodes_to_remove) on success, else None. - """ - # --- backward: Erf input must come from Div(x, sqrt(2)) --- + """Match Erf-based GELU: Div(x,sqrt(2))→Erf→Add(_,1)→Mul(0.5,_)→Mul(x,_). 
+ Returns (pwp_input, pwp_output, nodes_to_remove) or None.""" + # backward: Erf input must come from Div(x, sqrt(2)) div_node = model.find_producer(erf_node.input[0]) if div_node is None or div_node.op_type != "Div": return None - # one Div input is x, the other is sqrt(2) ≈ 1.4142 if self._is_const_scalar(model, div_node.input[1], 1.4142135): gelu_input = div_node.input[0] elif self._is_const_scalar(model, div_node.input[0], 1.4142135): @@ -311,7 +304,7 @@ def _match_erf_gelu(self, model, erf_node): else: return None - # --- forward: Erf → Add(_, 1) --- + # forward: Erf → Add(_, 1) erf_consumers = model.find_consumers(erf_node.output[0]) if len(erf_consumers) != 1 or erf_consumers[0].op_type != "Add": return None @@ -320,7 +313,7 @@ def _match_erf_gelu(self, model, erf_node): if len(other_add) != 1 or not self._is_const_scalar(model, other_add[0], 1.0): return None - # --- Add → Mul(0.5, _) --- + # Add → Mul(0.5, _) add_consumers = model.find_consumers(add_node.output[0]) if len(add_consumers) != 1 or add_consumers[0].op_type != "Mul": return None @@ -329,7 +322,7 @@ def _match_erf_gelu(self, model, erf_node): if len(other_mul_half) != 1 or not self._is_const_scalar(model, other_mul_half[0], 0.5): return None - # --- Mul(0.5,_) → Mul(x, _) --- + # Mul(0.5,_) → Mul(x, _) half_consumers = model.find_consumers(mul_half.output[0]) if len(half_consumers) != 1 or half_consumers[0].op_type != "Mul": return None From 1f6c5eb1dcf6ee5543effbc9f6d5456f246d906a Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Fri, 24 Apr 2026 14:27:14 +0100 Subject: [PATCH 04/12] pkg changes --- finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv | 14 ++++++-------- src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 6 +++--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv index 2838f03fe2..fc2d3ace3b 100644 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv @@ -1,7 
+1,5 @@ /**************************************************************************** - * Copyright (C) 2026, Advanced Micro Devices, Inc. - * All rights reserved. - * + * Copyright Advanced Micro Devices, Inc. * SPDX-License-Identifier: BSD-3-Clause * * @brief Per-activation configuration for pwpolyf. @@ -12,7 +10,7 @@ * configuration (clamping parameters + coefficient arrays) for the * piecewise polynomial activation unit. * - * Coefficient data auto-generated by pwpolyf_coeffs.py -- DEGREE=2 K=3 + * Coefficient data auto-generated by pwpolyf_rtl.py -- DEGREE=2 K=3 * NUM_OCTAVES=5. Segments: 81 Coefficients per segment: 3 * Polynomial: y = a_0 + a_1*x + a_2*x^2 + ... + a_d*x^d * Horner form: y = a_0 + x*(a_1 + x*(a_2 + ... x*a_d)) @@ -30,10 +28,10 @@ package pwpolyf_pkg; localparam int unsigned NUM_SEGS = 81; typedef struct { - logic [31:0] neg_clamp; - logic [31:0] pos_clamp; - bit pos_passthrough; - logic [31:0] coeffs[NUM_SEGS][DEGREE+1]; + int unsigned neg_clamp; + int unsigned pos_clamp; + bit pos_passthrough; + int unsigned coeffs[NUM_SEGS][DEGREE+1]; } func_cfg_t; localparam func_cfg_t GELU = '{ diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py index dccc9e8240..8045a94dbc 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py @@ -85,10 +85,10 @@ def generate_coeffs_pkg(K, num_samples=1000): lines.append(" localparam int unsigned NUM_SEGS = %d;" % num_segs) lines.append("") lines.append(" typedef struct {") - lines.append(" logic [31:0] neg_clamp;") - lines.append(" logic [31:0] pos_clamp;") + lines.append(" int unsigned neg_clamp;") + lines.append(" int unsigned pos_clamp;") lines.append(" bit pos_passthrough;") - lines.append(" logic [31:0] coeffs[NUM_SEGS][DEGREE+1];") + lines.append(" int unsigned coeffs[NUM_SEGS][DEGREE+1];") lines.append(" } func_cfg_t;") for func_name in SUPPORTED_FUNCS: From 
6d01b10309f89f8264ef3b8dcdee4f581dd38baa Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Fri, 24 Apr 2026 16:03:23 +0100 Subject: [PATCH 05/12] linting --- finn-rtllib/pwpolyf/hdl/pwpolyf.abc | 2 +- finn-rtllib/pwpolyf/hdl/pwpolyf.sv | 2 +- finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv | 2 +- finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv | 2 +- src/finn/custom_op/fpgadataflow/pwpolyf.py | 5 +- .../custom_op/fpgadataflow/rtl/__init__.py | 2 +- .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 18 +--- .../fpgadataflow/convert_to_hw_layers.py | 83 +++++++++---------- src/finn/util/pwpolyf.py | 22 ++--- .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 77 ++++++++++------- 10 files changed, 111 insertions(+), 104 deletions(-) diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.abc b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc index 06b77b967d..c25b5fda3d 100644 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf.abc +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc @@ -2,4 +2,4 @@ import queue read_sv pwpolyf_pkg.sv read_sv pwpolyf.sv setup_tb pwpolyf_tb -setup_top pwpolyf \ No newline at end of file +setup_top pwpolyf diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv index a2257fe17f..32c0b5ea6b 100644 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv @@ -323,4 +323,4 @@ module pwpolyf #( end end -endmodule : pwpolyf \ No newline at end of file +endmodule : pwpolyf diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv index fc2d3ace3b..cdf479355e 100644 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv @@ -390,4 +390,4 @@ package pwpolyf_pkg; } }; -endpackage \ No newline at end of file +endpackage diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv index 574b073b3e..f98929e2ab 100644 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv @@ -142,4 +142,4 @@ module pwpolyf_tb; end : genTests -endmodule : 
pwpolyf_tb \ No newline at end of file +endmodule : pwpolyf_tb diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py index e05ba9c2aa..d412e1669a 100644 --- a/src/finn/custom_op/fpgadataflow/pwpolyf.py +++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py @@ -177,8 +177,9 @@ def execute_node(self, context, graph): K = self.get_nodeattr("K") # lazy import to avoid hard dependency on torch at module level - import torch - from finn.util.pwpolyf import PiecewisePolyActivation + import torch # noqa: PLC0415 + + from finn.util.pwpolyf import PiecewisePolyActivation # noqa: PLC0415 mod = PiecewisePolyActivation(func, K=K) with torch.no_grad(): diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 77c1cb374d..053b8e8f02 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -39,12 +39,12 @@ from finn.custom_op.fpgadataflow.rtl.inner_shuffle_rtl import InnerShuffle_rtl from finn.custom_op.fpgadataflow.rtl.layernorm_rtl import LayerNorm_rtl from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MVAU_rtl +from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import PWPolyF_rtl from finn.custom_op.fpgadataflow.rtl.requant_rtl import Requant_rtl from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl -from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import PWPolyF_rtl from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py index 2935b37912..6bd80dd0df 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py +++ 
b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py @@ -32,12 +32,7 @@ from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend -from finn.util.pwpolyf import ( - CLAMP_CFG, - NUM_OCTAVES, - SUPPORTED_FUNCS, - _fit_coefficients, -) +from finn.util.pwpolyf import CLAMP_CFG, NUM_OCTAVES, SUPPORTED_FUNCS, _fit_coefficients def _float_to_hex(f): @@ -58,8 +53,7 @@ def generate_coeffs_pkg(K, num_samples=1000): lines = [] lines.append("// Auto-generated by pwpolyf_rtl.py — do not edit manually.") lines.append( - "// DEGREE=%d K=%d NUM_OCTAVES=%d Segments: %d" - % (degree, K, NUM_OCTAVES, num_segs) + "// DEGREE=%d K=%d NUM_OCTAVES=%d Segments: %d" % (degree, K, NUM_OCTAVES, num_segs) ) lines.append("") lines.append("package pwpolyf_pkg;") @@ -95,9 +89,7 @@ def generate_coeffs_pkg(K, num_samples=1000): for c in range(degree + 1): coeff_strs.append("32'h%s" % _float_to_hex(coeffs[seg, c])) comma = "," if seg < num_segs - 1 else "" - lines.append( - " '{ %s }%s\t// seg %d" % (", ".join(coeff_strs), comma, seg) - ) + lines.append(" '{ %s }%s\t// seg %d" % (", ".join(coeff_strs), comma, seg)) lines.append(" }") lines.append(" };") @@ -163,9 +155,7 @@ def generate_hdl(self, model, fpgapart, clk): def get_rtl_file_list(self, abspath=False): if abspath: code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" - rtllib_dir = os.path.join( - os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/" - ) + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/") else: code_gen_dir = "" rtllib_dir = "" diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 9b78cb6e40..73f2dadb2d 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -354,8 +354,13 @@ def apply(self, model): K = K_attr.i if K_attr is not None else 3 new_node = 
self._make_pwpolyf_node( - pwp_input, pwp_output, func, pwp_in_shape, idt, - "PWPolyF_" + node.name, K, + pwp_input, + pwp_output, + func, + pwp_in_shape, + idt, + "PWPolyF_" + node.name, + K, ) graph.node.insert(node_ind, new_node) graph.node.remove(node) @@ -374,7 +379,11 @@ def apply(self, model): func = self._SINGLE_OP_MAP[node.op_type] new_node = self._make_pwpolyf_node( - pwp_input, pwp_output, func, pwp_in_shape, idt, + pwp_input, + pwp_output, + func, + pwp_in_shape, + idt, "PWPolyF_" + node.name, ) graph.node.insert(node_ind, new_node) @@ -410,7 +419,11 @@ def apply(self, model): nodes_to_remove.append(mul_cand) new_node = self._make_pwpolyf_node( - sig_input, pwp_output, func, pwp_in_shape, idt, + sig_input, + pwp_output, + func, + pwp_in_shape, + idt, "PWPolyF_" + node.name, ) graph.node.insert(node_ind, new_node) @@ -433,7 +446,11 @@ def apply(self, model): continue new_node = self._make_pwpolyf_node( - pwp_input, pwp_output, "gelu", pwp_in_shape, idt, + pwp_input, + pwp_output, + "gelu", + pwp_in_shape, + idt, "PWPolyF_" + node.name, ) graph.node.insert(node_ind, new_node) @@ -1568,13 +1585,11 @@ def apply(self, model): mm_in_shape = model.get_tensor_shape(mm_input) mm_out_shape = model.get_tensor_shape(mm_output) assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( - n.name - + """: First + n.name + """: First input for xnorpopcount is not Wset to FINN DataType BINARY.""" ) assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( - n.name - + """: Second + n.name + """: Second input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" ) idt = DataType["BINARY"] @@ -1592,8 +1607,7 @@ def apply(self, model): simd = 1 wmem = mw * mh // (pe * simd) assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisiable by + n.name + """: Requirement (MW * MH) divisiable by (WMEM * PE * SIMD) is violated.""" ) # see if we have any following thresholds @@ -1606,8 +1620,7 @@ def apply(self, model): 
mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of + consumer.name + """: First dimension of thresholds neither 1 nor MH.""" ) odt = model.get_tensor_datatype(mt_output) @@ -1719,8 +1732,7 @@ def apply(self, model): simd = 1 wmem = mw * mh // (pe * simd) assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisible by + n.name + """: Requirement (MW * MH) divisible by (WMEM * PE * SIMD) is violated.""" ) # see if we have any following thresholds @@ -1733,8 +1745,7 @@ def apply(self, model): mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of + consumer.name + """: First dimension of thresholds neither 1 nor MH.""" ) odt = model.get_tensor_datatype(mt_output) @@ -1843,11 +1854,8 @@ def apply(self, model): try: k_h, k_w = sparsity["dw"]["kernel_shape"] except KeyError: - raise Exception( - n.name - + """: sparsity annotation doesn't indicate that MatMul - belongs to a depthwise convolution.""" - ) + raise Exception(n.name + """: sparsity annotation doesn't indicate that MatMul + belongs to a depthwise convolution.""") mm_input = n.input[0] mm_weight = n.input[1] @@ -1890,8 +1898,7 @@ def apply(self, model): mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) assert T.shape[0] == 1 or T.shape[0] == channels, ( - consumer.name - + """: First dimension of + consumer.name + """: First dimension of thresholds neither 1 nor Channels.""" ) odt = model.get_tensor_datatype(mt_output) @@ -2076,9 +2083,7 @@ def apply(self, model): to_remove.append(consumer) # Handle None shapes (shape inference might have failed) - assert ( - in_reshaped is not None - ), f"""Could not infer shape for tensor {n.input[0]}. + assert in_reshaped is not None, f"""Could not infer shape for tensor {n.input[0]}. 
Please run InferShapes first""" assert ( out_reshaped is not None @@ -2090,28 +2095,22 @@ def apply(self, model): # Some sanity checks for the transformation if idt != odt: - raise RuntimeError( - """ + raise RuntimeError(""" Input datatype and output datatype of the shuffle must be the same, did something go wrong during transformation? - """ - ) + """) if len(perm.ints) != len(in_reshaped): - raise RuntimeError( - f""" + raise RuntimeError(f""" Permutation list {perm.ints=} does not match the reshaped input dimension {in_reshaped=} - """ - ) + """) if len(perm.ints) != len(out_shape): - raise RuntimeError( - f""" + raise RuntimeError(f""" Permutation list {perm.ints=} does not match the reshaped out dimension {out_reshaped=} - """ - ) + """) simd = 1 @@ -2417,10 +2416,8 @@ def apply(self, model): scale_is_one = (scale == 1).all() bias_is_zero = not np.any(bias) if not (scale_is_one and (bias_is_zero or bias is not None)): - warnings.warn( - f"""{node.name}: Scale is not one or bias is not zero. - Can't be converted to HWCustomOp. Please run ExtractNormScaleBias first.""" - ) + warnings.warn(f"""{node.name}: Scale is not one or bias is not zero. + Can't be converted to HWCustomOp. 
Please run ExtractNormScaleBias first.""") continue act_in = node.input[0] act_out = node.output[0] diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py index 484cfde85c..da3f65e246 100644 --- a/src/finn/util/pwpolyf.py +++ b/src/finn/util/pwpolyf.py @@ -55,10 +55,10 @@ } CLAMP_CFG = { - "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, - "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, + "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, + "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False}, - "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False}, + "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False}, } @@ -73,7 +73,7 @@ def _segment_boundaries(K): # Positive segments for octave in range(NUM_OCTAVES): exp_val = EXP_BASE + octave - EXP_BIAS - base = 2.0 ** exp_val + base = 2.0**exp_val for sub in range(num_subs): lo = base * (1.0 + sub / num_subs) hi = base * (1.0 + (sub + 1) / num_subs) @@ -82,7 +82,7 @@ def _segment_boundaries(K): # Negative segments (mirror of positive) for octave in range(NUM_OCTAVES): exp_val = EXP_BASE + octave - EXP_BIAS - base = 2.0 ** exp_val + base = 2.0**exp_val for sub in range(num_subs): lo = base * (1.0 + sub / num_subs) hi = base * (1.0 + (sub + 1) / num_subs) @@ -151,9 +151,7 @@ def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): orig_shape = x.shape x_flat = x.contiguous().view(-1) - seg_idx, is_neg_clamp, is_pos_clamp = _segment_index( - x_flat, K, num_subs, num_segs - ) + seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs) c = coeffs[seg_idx] a0 = c[:, 0] @@ -207,8 +205,12 @@ def __init__(self, func="gelu", K=3, fit_samples=1000): def forward(self, x): if torch.onnx.is_in_onnx_export(): return PWPolyFFunction.apply( - x, self.coeffs, self.neg_clamp_val, self.pos_clamp_val, - 
self.func, self.K, + x, + self.coeffs, + self.neg_clamp_val, + self.pos_clamp_val, + self.func, + self.K, ) orig_shape = x.shape diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py index e491d82eba..a36117b90d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py +++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py @@ -41,20 +41,16 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.util.pwpolyf import PiecewisePolyActivation from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.pwpolyf import PiecewisePolyActivation test_fpga_part = "xczu3eg-sbva484-1-e" def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs): - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, num_input_vecs + [num_channels] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, num_input_vecs + [num_channels] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [num_channels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [num_channels]) pwpolyf_node = helper.make_node( "PWPolyF", @@ -131,11 +127,16 @@ def test_pwpolyf_onnx_export(func): tmpf = f.name try: torch.onnx.export( - mod, dummy, tmpf, - input_names=["input"], output_names=["output"], - opset_version=13, dynamo=False, + mod, + dummy, + tmpf, + input_names=["input"], + output_names=["output"], + opset_version=13, + dynamo=False, ) - import onnx + import onnx # noqa: PLC0415 + onnx_model = onnx.load(tmpf) finally: os.unlink(tmpf) @@ -161,9 +162,13 @@ def test_pwpolyf_infer_transform(func): tmpf = f.name try: torch.onnx.export( - mod, dummy, tmpf, - input_names=["inp"], output_names=["outp"], - opset_version=13, dynamo=False, + mod, + dummy, + tmpf, + 
input_names=["inp"], + output_names=["outp"], + opset_version=13, + dynamo=False, ) model = ModelWrapper(tmpf) finally: @@ -311,7 +316,10 @@ def make_silu_pattern_model(num_channels, num_input_vecs): mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp"], name="Mul_0") graph = helper.make_graph( - [sigmoid_node, mul_node], "silu_graph", [inp], [outp], + [sigmoid_node, mul_node], + "silu_graph", + [inp], + [outp], ) model = helper.make_model(graph, producer_name="test") model = ModelWrapper(model) @@ -343,7 +351,9 @@ def make_erf_gelu_model(num_channels, num_input_vecs): graph = helper.make_graph( [div_node, erf_node, add_node, mul_half_node, mul_x_node], - "erf_gelu_graph", [inp], [outp], + "erf_gelu_graph", + [inp], + [outp], initializer=[sqrt2, one, half], ) model = helper.make_model(graph, producer_name="test") @@ -356,16 +366,18 @@ def make_erf_gelu_model(num_channels, num_input_vecs): # ---------- standard ONNX op inference tests ---------- -@pytest.mark.parametrize("op_type,expected_func", [ - ("Gelu", "gelu"), - ("Sigmoid", "sigmoid"), - ("Tanh", "tanh"), -]) +@pytest.mark.parametrize( + "op_type,expected_func", + [ + ("Gelu", "gelu"), + ("Sigmoid", "sigmoid"), + ("Tanh", "tanh"), + ], +) @pytest.mark.parametrize("num_channels", [4, 16]) @pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) @pytest.mark.fpgadataflow -def test_pwpolyf_infer_standard_op(op_type, expected_func, - num_channels, num_input_vecs): +def test_pwpolyf_infer_standard_op(op_type, expected_func, num_channels, num_input_vecs): model = make_standard_activation_model(op_type, num_channels, num_input_vecs) assert model.graph.node[0].op_type == op_type @@ -449,8 +461,10 @@ def test_pwpolyf_sigmoid_multi_consumer_no_silu(): identity_node = helper.make_node("Identity", ["sig_out"], ["outp2"], name="Id_0") graph = helper.make_graph( - [sigmoid_node, mul_node, identity_node], "test_graph", - [inp], [outp1, outp2], + [sigmoid_node, mul_node, identity_node], + "test_graph", + [inp], 
+ [outp1, outp2], ) model = helper.make_model(graph, producer_name="test") model = ModelWrapper(model) @@ -469,11 +483,14 @@ def test_pwpolyf_sigmoid_multi_consumer_no_silu(): assert any(n.op_type == "Identity" for n in model.graph.node) -@pytest.mark.parametrize("op_type,expected_func", [ - ("Gelu", "gelu"), - ("Sigmoid", "sigmoid"), - ("Tanh", "tanh"), -]) +@pytest.mark.parametrize( + "op_type,expected_func", + [ + ("Gelu", "gelu"), + ("Sigmoid", "sigmoid"), + ("Tanh", "tanh"), + ], +) @pytest.mark.fpgadataflow def test_pwpolyf_standard_op_execution(op_type, expected_func): num_channels = 16 From adc14f2f706a74921d30023928c04a4078395b5a Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Fri, 24 Apr 2026 16:29:52 +0100 Subject: [PATCH 06/12] linting --- .../fpgadataflow/convert_to_hw_layers.py | 56 +++++++++++++------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 73f2dadb2d..3f714d7ae7 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1585,11 +1585,13 @@ def apply(self, model): mm_in_shape = model.get_tensor_shape(mm_input) mm_out_shape = model.get_tensor_shape(mm_output) assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( - n.name + """: First + n.name + + """: First input for xnorpopcount is not Wset to FINN DataType BINARY.""" ) assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( - n.name + """: Second + n.name + + """: Second input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" ) idt = DataType["BINARY"] @@ -1607,7 +1609,8 @@ def apply(self, model): simd = 1 wmem = mw * mh // (pe * simd) assert mw * mh == wmem * pe * simd, ( - n.name + """: Requirement (MW * MH) divisiable by + n.name + + """: Requirement (MW * MH) divisiable by (WMEM * PE * SIMD) is violated.""" ) # see 
if we have any following thresholds @@ -1620,7 +1623,8 @@ def apply(self, model): mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name + """: First dimension of + consumer.name + + """: First dimension of thresholds neither 1 nor MH.""" ) odt = model.get_tensor_datatype(mt_output) @@ -1732,7 +1736,8 @@ def apply(self, model): simd = 1 wmem = mw * mh // (pe * simd) assert mw * mh == wmem * pe * simd, ( - n.name + """: Requirement (MW * MH) divisible by + n.name + + """: Requirement (MW * MH) divisible by (WMEM * PE * SIMD) is violated.""" ) # see if we have any following thresholds @@ -1745,7 +1750,8 @@ def apply(self, model): mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name + """: First dimension of + consumer.name + + """: First dimension of thresholds neither 1 nor MH.""" ) odt = model.get_tensor_datatype(mt_output) @@ -1854,8 +1860,11 @@ def apply(self, model): try: k_h, k_w = sparsity["dw"]["kernel_shape"] except KeyError: - raise Exception(n.name + """: sparsity annotation doesn't indicate that MatMul - belongs to a depthwise convolution.""") + raise Exception( + n.name + + """: sparsity annotation doesn't indicate that MatMul + belongs to a depthwise convolution.""" + ) mm_input = n.input[0] mm_weight = n.input[1] @@ -1898,7 +1907,8 @@ def apply(self, model): mt_thres = consumer.input[1] T = model.get_initializer(mt_thres) assert T.shape[0] == 1 or T.shape[0] == channels, ( - consumer.name + """: First dimension of + consumer.name + + """: First dimension of thresholds neither 1 nor Channels.""" ) odt = model.get_tensor_datatype(mt_output) @@ -2083,7 +2093,9 @@ def apply(self, model): to_remove.append(consumer) # Handle None shapes (shape inference might have failed) - assert in_reshaped is not None, f"""Could not infer shape for tensor {n.input[0]}. 
+ assert ( + in_reshaped is not None + ), f"""Could not infer shape for tensor {n.input[0]}. Please run InferShapes first""" assert ( out_reshaped is not None @@ -2095,22 +2107,28 @@ def apply(self, model): # Some sanity checks for the transformation if idt != odt: - raise RuntimeError(""" + raise RuntimeError( + """ Input datatype and output datatype of the shuffle must be the same, did something go wrong during transformation? - """) + """ + ) if len(perm.ints) != len(in_reshaped): - raise RuntimeError(f""" + raise RuntimeError( + f""" Permutation list {perm.ints=} does not match the reshaped input dimension {in_reshaped=} - """) + """ + ) if len(perm.ints) != len(out_shape): - raise RuntimeError(f""" + raise RuntimeError( + f""" Permutation list {perm.ints=} does not match the reshaped out dimension {out_reshaped=} - """) + """ + ) simd = 1 @@ -2416,8 +2434,10 @@ def apply(self, model): scale_is_one = (scale == 1).all() bias_is_zero = not np.any(bias) if not (scale_is_one and (bias_is_zero or bias is not None)): - warnings.warn(f"""{node.name}: Scale is not one or bias is not zero. - Can't be converted to HWCustomOp. Please run ExtractNormScaleBias first.""") + warnings.warn( + f"""{node.name}: Scale is not one or bias is not zero. + Can't be converted to HWCustomOp. 
Please run ExtractNormScaleBias first.""" + ) continue act_in = node.input[0] act_out = node.output[0] From dd1e700599a8516b6946234d5c6a59dcfd9a70a4 Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Mon, 27 Apr 2026 17:35:17 +0100 Subject: [PATCH 07/12] improved testing --- docs/finn/pwpolyf.md | 35 ++-- .../pwpolyf/hdl/pwpolyf_template_wrapper.v | 14 +- finn_xsi/finn_xsi/adapter.py | 2 +- src/finn/custom_op/fpgadataflow/pwpolyf.py | 12 +- .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 14 +- .../fpgadataflow/convert_to_hw_layers.py | 5 +- src/finn/util/pwpolyf.py | 35 ++-- .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 187 +++++++++++++++++- 8 files changed, 249 insertions(+), 55 deletions(-) diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md index cd8510a7ef..11b86e88a4 100644 --- a/docs/finn/pwpolyf.md +++ b/docs/finn/pwpolyf.md @@ -16,11 +16,12 @@ K=3 this gives 81 segments. Segment selection reuses the FP32 exponent/mantissa bit-fields directly, matching the RTL implementation. Polynomial coefficients are generated at HDL build time by -`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials -to the reference PyTorch functions and writes `pwpolyf_pkg.sv` — a -SystemVerilog package with one `func_cfg_t` struct per activation -(clamping config + coefficient table). K can take any value; it defaults -to 3 when inferred from standard ONNX ops. +`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits polynomials of the +configured degree to the reference PyTorch functions and writes +`pwpolyf_pkg.sv` — a SystemVerilog package with one `func_cfg_t` struct per +activation (clamping config + coefficient table). Both K and degree are +configurable; they default to K=3 and degree=2 when inferred from standard +ONNX ops. ## Architecture @@ -74,18 +75,19 @@ Notes: ## Folding PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold. -Each PE instantiates its own polynomial evaluation pipeline (2 DSPs). 
+Each PE instantiates its own polynomial evaluation pipeline (`degree` DSPs). `SetFolding` handles PE selection automatically. -| PE | DSPs | Approx LUTs | Cycles (per spatial position) | -|----|------|-------------|-------------------------------| -| 1 | 2 | 200 | NumChannels | -| C | 2C | 200C | 1 | +| PE | Degree | DSPs | Approx LUTs | Cycles (per spatial position) | +|----|--------|------------|-------------------|-------------------------------| +| 1 | 2 | 2 | 200 | NumChannels | +| C | 2 | 2C | 200C | 1 | +| 1 | 3 | 3 | 300 | NumChannels | ## Resource estimates -- **DSP:** 2 per PE (two FP32 FMA stages) -- **LUT:** ~200 per PE (segment address decode + control) +- **DSP:** `degree * PE` (one FP32 FMA stage per polynomial degree per PE) +- **LUT:** `~100 * degree * PE` (segment address decode + control) - **BRAM/URAM:** 0 (coefficients stored in LUT/registers) ## ONNX export @@ -109,7 +111,8 @@ Attributes on the explicit PWPolyF ONNX node: | Attribute | Type | Description | |--------------------|--------|------------------------------------------| | `func` | string | Activation function name | -| `K` | int | Mantissa subdivision bits | +| `K` | int | Mantissa subdivision bits (default 3) | +| `degree` | int | Polynomial degree / FMA stages (default 2) | | `NumChannels` | int | Number of channels (last input dim) | | `PE` | int | Processing elements | | `inputDataType` | string | Input data type (FLOAT32) | @@ -159,6 +162,10 @@ Attributes on the explicit PWPolyF ONNX node: - **SiLU edge cases**: reversed Mul input order, multi-consumer Sigmoid - **Execution correctness**: standard ops produce same output as PiecewisePolyActivation - **SpecializeLayers**: verifies RTL specialization -- **Resource estimates**: DSP/LUT/BRAM checks across PE values +- **Resource estimates**: DSP/LUT/BRAM checks across PE and degree values - **Folded shapes**: input/output/stream width calculations - **Expected cycles**: cycle count estimation + analysis pass integration +- 
**Coefficient package**: `generate_coeffs_pkg()` output validation for K and degree +- **HDL generation** (Vivado): verifies `generate_hdl` produces correct files and package content +- **RTL simulation** (Vivado, slow): node-by-node rtlsim with cycle count verification +- **Stitched IP** (Vivado, slow): end-to-end stitched IP rtlsim diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v index eecf2ac74d..9bbbaa0987 100644 --- a/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v +++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v @@ -35,7 +35,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #( parameter PE = $PE$, parameter FUNC = $FUNC$ )( - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out0_V, ASSOCIATED_RESET ap_rst_n" *) (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) @@ -47,9 +47,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( input [$IN_WIDTH$-1:0] in0_V_TDATA, //- AXI Stream - Output ------------- - input out_V_TREADY, - output out_V_TVALID, - output [$OUT_WIDTH$-1:0] out_V_TDATA + input out0_V_TREADY, + output out0_V_TVALID, + output [$OUT_WIDTH$-1:0] out0_V_TDATA ); pwpolyf #( @@ -61,9 +61,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #( .xdat(in0_V_TDATA), .xvld(in0_V_TVALID), .xrdy(in0_V_TREADY), - .ydat(out_V_TDATA), - .yvld(out_V_TVALID), - .yrdy(out_V_TREADY) + .ydat(out0_V_TDATA), + .yvld(out0_V_TVALID), + .yrdy(out0_V_TREADY) ); endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn_xsi/finn_xsi/adapter.py b/finn_xsi/finn_xsi/adapter.py index 0b73787a60..a10d7bde9c 100644 --- a/finn_xsi/finn_xsi/adapter.py +++ b/finn_xsi/finn_xsi/adapter.py @@ -47,7 +47,7 @@ def compile_sim_obj(top_module_name, source_list, sim_out_dir, debug=False, beha # sort src list so that packages are loaded first # these packages must be compiled before 
modules that depend on them - pkg_patterns = ["swg_pkg", "mvu_pkg"] + pkg_patterns = ["swg_pkg", "mvu_pkg", "pwpolyf_pkg"] srcs_list = sorted( source_list, key=lambda s: (not any(pkg in s for pkg in pkg_patterns), s) ) diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py index d412e1669a..48b5f33fd9 100644 --- a/src/finn/custom_op/fpgadataflow/pwpolyf.py +++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py @@ -59,6 +59,8 @@ def get_nodeattr_types(self): # FINN DataTypes for inputs, outputs (always FLOAT32) "inputDataType": ("s", True, ""), "outputDataType": ("s", True, ""), + # polynomial degree (number of FMA stages per PE) + "degree": ("i", False, 2), # number of input vectors, examples: # [1] is a single vector (like a FC layer with batch=1) # [4] is four vectors (like a FC layer with batch=4) @@ -155,7 +157,8 @@ def get_exp_cycles(self): def lut_estimation(self): pe = self.get_nodeattr("PE") - return 200 * pe + degree = self.get_nodeattr("degree") + return 100 * degree * pe def bram_estimation(self): # coefficients stored in LUT ROM, not BRAM @@ -165,9 +168,9 @@ def uram_estimation(self): return 0 def dsp_estimation(self, fpgapart=None): - # two DSPFP32 FMA instances per PE (Horner evaluation) pe = self.get_nodeattr("PE") - return 2 * pe + degree = self.get_nodeattr("degree") + return degree * pe def execute_node(self, context, graph): node = self.onnx_node @@ -181,7 +184,8 @@ def execute_node(self, context, graph): from finn.util.pwpolyf import PiecewisePolyActivation # noqa: PLC0415 - mod = PiecewisePolyActivation(func, K=K) + degree = self.get_nodeattr("degree") + mod = PiecewisePolyActivation(func, K=K, degree=degree) with torch.no_grad(): x = torch.from_numpy(inp.astype(np.float32)) y = mod(x) diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py index 6bd80dd0df..f9ee038214 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py +++ 
b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py @@ -40,13 +40,12 @@ def _float_to_hex(f): return "%08X" % struct.unpack("!I", struct.pack("!f", float(f)))[0] -def generate_coeffs_pkg(K, num_samples=1000): +def generate_coeffs_pkg(K, degree=2, num_samples=1000): """Generate the pwpolyf_pkg.sv package content for a given K value. Produces a SystemVerilog package with a func_cfg_t struct per activation function, containing clamping parameters and polynomial coefficients. """ - degree = 2 num_subs = 1 << K num_segs = 1 + 2 * NUM_OCTAVES * num_subs @@ -72,7 +71,7 @@ def generate_coeffs_pkg(K, num_samples=1000): for func_name in SUPPORTED_FUNCS: cfg = CLAMP_CFG[func_name] - coeffs = _fit_coefficients(func_name, K, num_samples) + coeffs = _fit_coefficients(func_name, K, degree=degree, num_samples=num_samples) label = func_name.upper() neg_hex = _float_to_hex(cfg["neg_clamp"]) pos_hex = _float_to_hex(cfg["pos_clamp"]) @@ -123,8 +122,8 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("gen_top_module", topname) code_gen_dict = { - "$MODULE_NAME_AXI_WRAPPER$": topname + "_axi_wrapper", - "$TOP_MODULE$": topname + "_axi_wrapper", + "$MODULE_NAME_AXI_WRAPPER$": topname, + "$TOP_MODULE$": topname, "$PE$": str(pe), "$FUNC$": '"%s"' % func, "$IN_WIDTH$": str(pe * 32), @@ -143,9 +142,10 @@ def generate_hdl(self, model, fpgapart, clk): for sv_file in ["pwpolyf.sv", "queue.sv"]: shutil.copy(rtllib_dir + sv_file, code_gen_dir) - # generate package with coefficients matching the node's K value + # generate package with coefficients matching the node's K and degree K = self.get_nodeattr("K") - pkg_data = generate_coeffs_pkg(K) + degree = self.get_nodeattr("degree") + pkg_data = generate_coeffs_pkg(K, degree=degree) with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "w") as f: f.write(pkg_data) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 3f714d7ae7..abc5f68b5b 
100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -317,7 +317,9 @@ def _match_erf_gelu(self, model, erf_node): return (gelu_input, mul_x.output[0], nodes_to_remove) @staticmethod - def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3): + def _make_pwpolyf_node( + pwp_input, pwp_output, func, in_shape, idt, name, K=3, degree=2 + ): num_channels = in_shape[-1] return helper.make_node( "PWPolyF", @@ -327,6 +329,7 @@ def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3): backend="fpgadataflow", func=func, K=K, + degree=degree, NumChannels=num_channels, PE=1, inputDataType=idt.name, diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py index da3f65e246..9fd82c570c 100644 --- a/src/finn/util/pwpolyf.py +++ b/src/finn/util/pwpolyf.py @@ -91,19 +91,19 @@ def _segment_boundaries(K): return bounds -def _fit_coefficients(func_name, K, num_samples=1000): - """Fit degree-2 polynomials per segment. Returns (NUM_SEGS, 3) tensor.""" +def _fit_coefficients(func_name, K, degree=2, num_samples=1000): + """Fit degree-N polynomials per segment. 
Returns (NUM_SEGS, degree+1) tensor.""" ref_fn = REFERENCE_FUNCS[func_name] bounds = _segment_boundaries(K) num_segs = len(bounds) - coeffs = np.zeros((num_segs, 3), dtype=np.float64) + coeffs = np.zeros((num_segs, degree + 1), dtype=np.float64) for seg, (lo, hi) in enumerate(bounds): xs = np.linspace(lo, hi, num_samples, dtype=np.float64) with torch.no_grad(): ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64) - c = np.polynomial.polynomial.polyfit(xs, ys, deg=2) - coeffs[seg] = c[:3] + c = np.polynomial.polynomial.polyfit(xs, ys, deg=degree) + coeffs[seg] = c[: degree + 1] return torch.from_numpy(coeffs.astype(np.float32)) @@ -146,6 +146,7 @@ class PWPolyFFunction(torch.autograd.Function): def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): num_subs = 1 << K num_segs = 1 + 2 * NUM_OCTAVES * num_subs + degree = coeffs.shape[1] - 1 pos_passthrough = CLAMP_CFG[func]["pos_passthrough"] orig_shape = x.shape @@ -154,11 +155,10 @@ def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs) c = coeffs[seg_idx] - a0 = c[:, 0] - a1 = c[:, 1] - a2 = c[:, 2] - - y = a0 + x_flat * (a1 + a2 * x_flat) + # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...)) + y = c[:, degree] + for i in range(degree - 1, -1, -1): + y = c[:, i] + x_flat * y if pos_passthrough: pos_val = x_flat @@ -183,18 +183,19 @@ class PiecewisePolyActivation(nn.Module): Emits a single PWPolyF custom op node during ONNX export. 
""" - def __init__(self, func="gelu", K=3, fit_samples=1000): + def __init__(self, func="gelu", K=3, degree=2, fit_samples=1000): super().__init__() if func not in SUPPORTED_FUNCS: raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS)) self.func = func self.K = K + self.degree = degree self.num_subs = 1 << K self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"] - coeffs = _fit_coefficients(func, K, fit_samples) + coeffs = _fit_coefficients(func, K, degree=degree, num_samples=fit_samples) self.register_buffer("coeffs", coeffs) neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32) @@ -221,12 +222,10 @@ def forward(self, x): ) c = self.coeffs[seg_idx] - a0 = c[:, 0] - a1 = c[:, 1] - a2 = c[:, 2] - - # Horner: y = a0 + x*(a1 + a2*x) - y = a0 + x_flat * (a1 + a2 * x_flat) + # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...)) + y = c[:, self.degree] + for i in range(self.degree - 1, -1, -1): + y = c[:, i] + x_flat * y if self.pos_passthrough: pos_val = x_flat diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py index a36117b90d..8e333ccd08 100644 --- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py +++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py @@ -41,11 +41,19 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import generate_coeffs_pkg from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from 
finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.pwpolyf import PiecewisePolyActivation test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs): @@ -220,17 +228,19 @@ def test_pwpolyf_specialize_rtl(func): @pytest.mark.parametrize("func", ["gelu", "tanh"]) @pytest.mark.parametrize("pe", [1, 2, 4]) +@pytest.mark.parametrize("degree", [2, 3]) @pytest.mark.fpgadataflow -def test_pwpolyf_resource_estimates(func, pe): +def test_pwpolyf_resource_estimates(func, pe, degree): K = 3 num_channels = 8 model = make_pwpolyf_modelwrapper(func, K, num_channels, [1]) node = model.graph.node[0] inst = getCustomOp(node) inst.set_nodeattr("PE", pe) + inst.set_nodeattr("degree", degree) - assert inst.dsp_estimation() == 2 * pe - assert inst.lut_estimation() == 200 * pe + assert inst.dsp_estimation() == degree * pe + assert inst.lut_estimation() == 100 * degree * pe assert inst.bram_estimation() == 0 assert inst.uram_estimation() == 0 @@ -563,3 +573,174 @@ def test_pwpolyf_erf_gelu_execution(): with torch.no_grad(): y_expected = ref_mod(torch.from_numpy(x)).numpy() assert np.allclose(y_produced, y_expected, atol=1e-6) + + +# ---------- coefficient package smoketests ---------- + + +@pytest.mark.parametrize("K", [2, 3, 4]) +@pytest.mark.fpgadataflow +def test_pwpolyf_generate_coeffs_pkg(K): + """Verify generate_coeffs_pkg produces valid SystemVerilog package.""" + pkg = generate_coeffs_pkg(K) + + assert "package pwpolyf_pkg" in pkg + assert "endpackage" in pkg + # localparam lines use padded alignment in the generated SV + assert "DEGREE = 2;" in pkg + assert "K = %d;" % K in pkg + + num_segs = 1 + 2 * 5 * (1 << K) + assert "NUM_SEGS = %d;" % num_segs in pkg + + for func_label in ["GELU", "SILU", "SIGMOID", "TANH"]: + assert func_label + " = '{" in pkg + + seg_lines = [line for line in 
pkg.split("\n") if "// seg" in line] + # Each function has num_segs segments, 4 functions total + assert len(seg_lines) == 4 * num_segs + + +@pytest.mark.parametrize("degree", [1, 2, 3]) +@pytest.mark.fpgadataflow +def test_pwpolyf_generate_coeffs_pkg_degree(degree): + """Verify generate_coeffs_pkg respects degree parameter.""" + K = 3 + pkg = generate_coeffs_pkg(K, degree=degree) + + assert "DEGREE = %d;" % degree in pkg + # Each segment line should have degree+1 coefficient values + seg_lines = [line for line in pkg.split("\n") if "// seg 0" in line] + for line in seg_lines: + hex_vals = [s for s in line.split() if s.startswith("32'h")] + assert len(hex_vals) == degree + 1 + + +# ---------- generate_hdl smoketests ---------- + + +@pytest.mark.parametrize("func", ["gelu", "tanh"]) +@pytest.mark.parametrize("pe", [1, 2]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_pwpolyf_generate_hdl(func, pe): + """Verify generate_hdl produces expected RTL files.""" + num_channels = 4 + model = make_pwpolyf_modelwrapper(func, 3, num_channels, [1]) + model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(GiveUniqueNodeNames()) + + node = model.graph.node[0] + assert node.op_type == "PWPolyF_rtl" + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + + # Re-fetch node after transform (PrepareIP returns a new model) + node = model.graph.node[0] + inst = getCustomOp(node) + + code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen") + assert code_gen_dir, "code_gen_dir_ipgen not set after PrepareIP" + assert os.path.isfile(os.path.join(code_gen_dir, "pwpolyf_pkg.sv")) + assert os.path.isfile(os.path.join(code_gen_dir, "pwpolyf.sv")) + assert os.path.isfile(os.path.join(code_gen_dir, "queue.sv")) + + topname = inst.get_nodeattr("gen_top_module") + assert os.path.isfile(os.path.join(code_gen_dir, topname + ".v")) + + # Verify package content + with 
open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "r") as f: + pkg_content = f.read() + assert "DEGREE = 2;" in pkg_content + assert "K = 3;" in pkg_content + assert func.upper() + " = '{" in pkg_content + + +# ---------- RTL simulation tests ---------- + + +@pytest.mark.parametrize("func", ["gelu", "sigmoid"]) +@pytest.mark.parametrize("num_channels", [4, 8]) +@pytest.mark.parametrize("pe", [1, 2, 4]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +def test_pwpolyf_rtlsim(func, num_channels, pe): + """Node-by-node RTL simulation of PWPolyF_rtl.""" + if num_channels % pe != 0: + pytest.skip("PE does not divide NumChannels") + + K = 3 + model = make_pwpolyf_modelwrapper(func, K, num_channels, [1]) + + # Get cppsim reference output + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + input_dict = {"inp": x} + y_ref = oxe.execute_onnx(model, input_dict)["outp"] + + # Specialize to RTL and set PE + model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(GiveUniqueNodeNames()) + node = model.graph.node[0] + assert node.op_type == "PWPolyF_rtl" + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + + # RTL simulation pipeline + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + y_rtl = oxe.execute_onnx(model, input_dict)["outp"] + assert np.allclose(y_ref, y_rtl, atol=1e-4), ( + "RTL output does not match cppsim reference" + ) + + # Verify cycle count (re-fetch node after transforms) + node = model.graph.node[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) + assert exp_cycles != 0 + + +@pytest.mark.parametrize("func", ["gelu", "sigmoid"]) 
+@pytest.mark.parametrize("pe", [1, 2]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +def test_pwpolyf_rtlsim_stitched_ip(func, pe): + """Stitched IP RTL simulation of PWPolyF_rtl.""" + K = 3 + num_channels = 4 + model = make_pwpolyf_modelwrapper(func, K, num_channels, [1]) + + # Get cppsim reference output + x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32) + input_dict = {model.graph.input[0].name: x} + y_ref = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name] + + # Specialize to RTL and set PE + model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(GiveUniqueNodeNames()) + node = model.graph.node[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + + # Stitched IP pipeline + model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model.set_metadata_prop("exec_mode", "rtlsim") + + input_dict = {model.graph.input[0].name: x} + y_rtl = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name] + assert np.allclose(y_ref, y_rtl, atol=1e-4), ( + "Stitched IP output does not match cppsim reference" + ) From c23097b92d43ed67371c4fd69bd20ee078bbb683 Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Tue, 28 Apr 2026 13:28:12 +0100 Subject: [PATCH 08/12] versal check --- docs/finn/pwpolyf.md | 26 +++++++++++-------- src/finn/custom_op/fpgadataflow/pwpolyf.py | 26 ++++++++++++++++--- .../fpgadataflow/specialize_layers.py | 26 +++++++++++++++++++ .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 19 ++++++++++---- 4 files changed, 78 insertions(+), 19 deletions(-) diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md index 11b86e88a4..0fd89e5738 100644 --- a/docs/finn/pwpolyf.md +++ b/docs/finn/pwpolyf.md @@ -5,10 +5,10 @@ PWPolyF is a hardware activation layer that 
approximates nonlinear functions (GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated via Horner's method on a chain of DSPFP32 FMA units. With the default degree 2, this uses -two cascaded DSPs per PE, giving single-cycle-per-element throughput with no -BRAM usage. Per-function configuration (clamping behaviour and polynomial -coefficients) is delivered through a SystemVerilog package (`pwpolyf_pkg`) -using a `func_cfg_t` struct. +two cascaded DSPs and one RAMB18 coefficient ROM per PE, giving +single-cycle-per-element throughput. Per-function configuration (clamping +behaviour and polynomial coefficients) is delivered through a SystemVerilog +package (`pwpolyf_pkg`) using a `func_cfg_t` struct. The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero region, positive octave sub-segments, and negative mirrors. With the default @@ -25,7 +25,9 @@ ONNX ops. ## Architecture -PWPolyF is **RTL-only** (no HLS variant). Two export paths are supported: +PWPolyF is **RTL-only** (no HLS variant) and targets Versal devices only, +since the RTL instantiates the Versal DSPFP32 primitive. Two export paths are +supported: ``` Path A: PiecewisePolyActivation Path B: nn.GELU / nn.SiLU / etc. @@ -78,17 +80,19 @@ PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold. Each PE instantiates its own polynomial evaluation pipeline (`degree` DSPs). `SetFolding` handles PE selection automatically. 
-| PE | Degree | DSPs | Approx LUTs | Cycles (per spatial position) | -|----|--------|------------|-------------------|-------------------------------| -| 1 | 2 | 2 | 200 | NumChannels | -| C | 2 | 2C | 200C | 1 | -| 1 | 3 | 3 | 300 | NumChannels | +| PE | Degree | DSPs | BRAM18s | Approx LUTs | Cycles (per spatial position) | +|----|--------|------------|-------------------|------------------|-------------------------------| +| 1 | 2 | 2 | 1 | 200 | NumChannels | +| C | 2 | 2C | C | 200C | 1 | +| 1 | 3 | 3 | 2 | 300 | NumChannels | ## Resource estimates - **DSP:** `degree * PE` (one FP32 FMA stage per polynomial degree per PE) - **LUT:** `~100 * degree * PE` (segment address decode + control) -- **BRAM/URAM:** 0 (coefficients stored in LUT/registers) +- **BRAM18:** `(degree - 1) * PE` for default `K=3` (Vivado infers delayed + coefficient lookups as 32-bit ROMs) +- **URAM:** 0 ## ONNX export diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py index 48b5f33fd9..a9143984d5 100644 --- a/src/finn/custom_op/fpgadataflow/pwpolyf.py +++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py @@ -26,6 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math + import numpy as np from qonnx.core.datatype import DataType @@ -40,7 +42,7 @@ class PWPolyF(HWCustomOp): """ HW op for piecewise polynomial activations (GELU, SiLU, Sigmoid, Tanh). - Element-wise FP32, coefficients baked into RTL. No weights or BRAM. + Element-wise FP32, coefficients baked into RTL. No weights. 
""" def __init__(self, onnx_node, **kwargs): @@ -161,8 +163,26 @@ def lut_estimation(self): return 100 * degree * pe def bram_estimation(self): - # coefficients stored in LUT ROM, not BRAM - return 0 + pe = self.get_nodeattr("PE") + degree = self.get_nodeattr("degree") + num_segs = self.get_num_segments() + + if degree <= 1: + return 0 + + # Stages after the first use a registered dynamic coefficient lookup + # for the DSP C input. Vivado infers this as one 32-bit wide ROM per + # stage and PE, backed by RAMB18 for the default K=3 table depth. + coeff_width = 32 + if coeff_width <= 18 or num_segs > 512: + bram18_per_coeff_rom = math.ceil(num_segs / 1024) * math.ceil( + coeff_width / 18 + ) + else: + bram18_per_coeff_rom = math.ceil(num_segs / 512) * math.ceil( + coeff_width / 36 + ) + return pe * (degree - 1) * bram18_per_coeff_rom def uram_estimation(self): return 0 diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index dcd2472e0a..5c0dd3a0cb 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -82,6 +82,11 @@ def _determine_impl_style(node, fpgapart, model): return "rtl" else: return "hls" + elif optype == "PWPolyF": + if _pwpolyf_rtl_possible(node, fpgapart): + return "rtl" + else: + _raise_pwpolyf_unsupported(node, fpgapart) elif optype == "Requant": if _requant_rtl_possible(node, fpgapart): return "rtl" @@ -106,6 +111,8 @@ def _determine_impl_style(node, fpgapart, model): if hls_variant: return "hls" elif rtl_variant: + if optype == "PWPolyF" and not _pwpolyf_rtl_possible(node, fpgapart): + _raise_pwpolyf_unsupported(node, fpgapart) warn_str = """There is no HLS variant of %s. 
Node %s will automatically be set to RTL variant.""" % ( node.op_type, @@ -158,6 +165,11 @@ def _determine_impl_style(node, fpgapart, model): warnings.warn(warn_str) return "hls" + elif optype == "PWPolyF": + if _pwpolyf_rtl_possible(node, fpgapart): + return "rtl" + else: + _raise_pwpolyf_unsupported(node, fpgapart) elif optype == "LayerNorm": if _layernorm_rtl_possible(node, fpgapart): return "rtl" @@ -346,6 +358,20 @@ def _layernorm_rtl_possible(n, fpgapart): return True +def _pwpolyf_rtl_possible(n, fpgapart): + # PWPolyF uses the Versal DSPFP32 primitive. + return is_versal(fpgapart) + + +def _raise_pwpolyf_unsupported(n, fpgapart): + raise Exception( + """PWPolyF node %s cannot be specialized for FPGA part %s. + PWPolyF_rtl uses the Versal DSPFP32 primitive and is only supported + on Versal devices.""" + % (n.name, fpgapart) + ) + + def _requant_rtl_possible(n, fpgapart): # Checks whether RTL-based Requant is supported # RTL Requant requires: diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py index 8e333ccd08..2b4f1e119e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py +++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py @@ -52,7 +52,8 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.pwpolyf import PiecewisePolyActivation -test_fpga_part = "xczu3eg-sbva484-1-e" +test_fpga_part = "xcve2002-sbva484-2MP-e-S" +non_versal_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -226,12 +227,20 @@ def test_pwpolyf_specialize_rtl(func): assert inst.get_nodeattr("K") == K +@pytest.mark.fpgadataflow +def test_pwpolyf_specialize_rejects_non_versal(): + model = make_pwpolyf_modelwrapper("gelu", 3, 8, [1]) + + with pytest.raises(Exception, match="Versal"): + model.transform(SpecializeLayers(non_versal_fpga_part)) + + @pytest.mark.parametrize("func", ["gelu", "tanh"]) @pytest.mark.parametrize("pe", [1, 2, 4]) -@pytest.mark.parametrize("degree", [2, 3]) 
+@pytest.mark.parametrize("degree", [1, 2, 3]) +@pytest.mark.parametrize("K, bram18_per_coeff_rom", [(3, 1), (6, 2)]) @pytest.mark.fpgadataflow -def test_pwpolyf_resource_estimates(func, pe, degree): - K = 3 +def test_pwpolyf_resource_estimates(func, pe, degree, K, bram18_per_coeff_rom): num_channels = 8 model = make_pwpolyf_modelwrapper(func, K, num_channels, [1]) node = model.graph.node[0] @@ -241,7 +250,7 @@ def test_pwpolyf_resource_estimates(func, pe, degree): assert inst.dsp_estimation() == degree * pe assert inst.lut_estimation() == 100 * degree * pe - assert inst.bram_estimation() == 0 + assert inst.bram_estimation() == max(degree - 1, 0) * pe * bram18_per_coeff_rom assert inst.uram_estimation() == 0 From 7d56e906a7eb1859df779133543a5f0a162e6f5f Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Wed, 29 Apr 2026 09:59:16 +0100 Subject: [PATCH 09/12] linting --- src/finn/custom_op/fpgadataflow/pwpolyf.py | 9 ++------- .../fpgadataflow/convert_to_hw_layers.py | 4 +--- tests/fpgadataflow/test_fpgadataflow_pwpolyf.py | 10 ++++------ 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py index a9143984d5..206fd3690c 100644 --- a/src/finn/custom_op/fpgadataflow/pwpolyf.py +++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import math - import numpy as np from qonnx.core.datatype import DataType @@ -175,13 +174,9 @@ def bram_estimation(self): # stage and PE, backed by RAMB18 for the default K=3 table depth. 
coeff_width = 32 if coeff_width <= 18 or num_segs > 512: - bram18_per_coeff_rom = math.ceil(num_segs / 1024) * math.ceil( - coeff_width / 18 - ) + bram18_per_coeff_rom = math.ceil(num_segs / 1024) * math.ceil(coeff_width / 18) else: - bram18_per_coeff_rom = math.ceil(num_segs / 512) * math.ceil( - coeff_width / 36 - ) + bram18_per_coeff_rom = math.ceil(num_segs / 512) * math.ceil(coeff_width / 36) return pe * (degree - 1) * bram18_per_coeff_rom def uram_estimation(self): diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index abc5f68b5b..dc09b3daeb 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -317,9 +317,7 @@ def _match_erf_gelu(self, model, erf_node): return (gelu_input, mul_x.output[0], nodes_to_remove) @staticmethod - def _make_pwpolyf_node( - pwp_input, pwp_output, func, in_shape, idt, name, K=3, degree=2 - ): + def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3, degree=2): num_channels = in_shape[-1] return helper.make_node( "PWPolyF", diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py index 2b4f1e119e..4a1b656631 100644 --- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py +++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py @@ -704,9 +704,7 @@ def test_pwpolyf_rtlsim(func, num_channels, pe): model = model.transform(PrepareRTLSim()) y_rtl = oxe.execute_onnx(model, input_dict)["outp"] - assert np.allclose(y_ref, y_rtl, atol=1e-4), ( - "RTL output does not match cppsim reference" - ) + assert np.allclose(y_ref, y_rtl, atol=1e-4), "RTL output does not match cppsim reference" # Verify cycle count (re-fetch node after transforms) node = model.graph.node[0] @@ -750,6 +748,6 @@ def test_pwpolyf_rtlsim_stitched_ip(func, pe): input_dict = {model.graph.input[0].name: x} y_rtl = oxe.execute_onnx(model, 
input_dict)[model.graph.output[0].name] - assert np.allclose(y_ref, y_rtl, atol=1e-4), ( - "Stitched IP output does not match cppsim reference" - ) + assert np.allclose( + y_ref, y_rtl, atol=1e-4 + ), "Stitched IP output does not match cppsim reference" From 1b6692d5c75d0febbc0205f40037462b99a695cc Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Mon, 18 May 2026 15:29:16 +0100 Subject: [PATCH 10/12] move pwpolyf torch module --- docs/finn/pwpolyf.md | 3 +- docs/finn/source_code/finn.util.rst | 9 + src/finn/custom_op/fpgadataflow/pwpolyf.py | 2 +- .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 7 +- src/finn/util/pwpolyf.py | 239 +++--------------- src/finn/util/torch_hw_modules.py | 236 +++++++++++++++++ .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 2 +- 7 files changed, 288 insertions(+), 210 deletions(-) create mode 100644 src/finn/util/torch_hw_modules.py diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md index 0fd89e5738..cf3aaeeeb5 100644 --- a/docs/finn/pwpolyf.md +++ b/docs/finn/pwpolyf.md @@ -138,9 +138,10 @@ Attributes on the explicit PWPolyF ONNX node: | File | Purpose | |------|---------| +| `util/torch_hw_modules.py` | PyTorch activation module, ONNX export, software simulation | | `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) | | `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, package generation, rtlsim, IPI) | -| `util/pwpolyf.py` | PyTorch activation module, ONNX export, software simulation | +| `util/pwpolyf.py` | Compatibility imports for existing PWPolyF utility users | | `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation | | `builder/build_dataflow_steps.py` | Build pipeline integration | | `transformation/fpgadataflow/set_folding.py` | Folding support (pe_ops list) | diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index a06d55d81e..5ceebc2436 100644 --- 
a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -188,6 +188,15 @@ finn.util.pytorch :show-inheritance: +finn.util.torch_hw_modules +--------------------------- + +.. automodule:: finn.util.torch_hw_modules + :members: + :undoc-members: + :show-inheritance: + + finn.util.pwpolyf ------------------- diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py index 206fd3690c..746ebdeb38 100644 --- a/src/finn/custom_op/fpgadataflow/pwpolyf.py +++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py @@ -197,7 +197,7 @@ def execute_node(self, context, graph): # lazy import to avoid hard dependency on torch at module level import torch # noqa: PLC0415 - from finn.util.pwpolyf import PiecewisePolyActivation # noqa: PLC0415 + from finn.util.torch_hw_modules import PiecewisePolyActivation # noqa: PLC0415 degree = self.get_nodeattr("degree") mod = PiecewisePolyActivation(func, K=K, degree=degree) diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py index f9ee038214..3411c81a8b 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py @@ -32,7 +32,12 @@ from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend -from finn.util.pwpolyf import CLAMP_CFG, NUM_OCTAVES, SUPPORTED_FUNCS, _fit_coefficients +from finn.util.torch_hw_modules import ( + CLAMP_CFG, + NUM_OCTAVES, + SUPPORTED_FUNCS, + _fit_coefficients, +) def _float_to_hex(f): diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py index 9fd82c570c..1972b0248a 100644 --- a/src/finn/util/pwpolyf.py +++ b/src/finn/util/pwpolyf.py @@ -26,212 +26,39 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-""" -Piecewise polynomial activation - PyTorch module and software model. +"""Compatibility imports for PWPolyF PyTorch utilities. -Drop-in activation that approximates GELU, SiLU, Sigmoid, and Tanh using -degree-2 polynomials, matching the pwpolyf RTL behaviour. Emits a single -PWPolyF custom op node during ONNX export (requires dynamo=False). +The canonical home for PyTorch modules that match FINN hardware behavior is +``finn.util.torch_hw_modules``. This module is kept to avoid breaking existing +imports while downstream code moves to the new location. """ -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F - -# Constants matching the SystemVerilog module -NUM_OCTAVES = 5 -EXP_BIAS = 127 -EXP_BASE = 125 -EXP_CLAMP = 130 - -SUPPORTED_FUNCS = ("gelu", "silu", "sigmoid", "tanh") - -REFERENCE_FUNCS = { - "gelu": lambda x: F.gelu(x), - "silu": lambda x: F.silu(x), - "sigmoid": lambda x: torch.sigmoid(x), - "tanh": lambda x: torch.tanh(x), -} - -CLAMP_CFG = { - "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, - "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, - "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False}, - "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False}, -} - - -def _segment_boundaries(K): - """Return (lo, hi) bounds for every segment.""" - num_subs = 1 << K - bounds = [] - - # Segment 0: near-zero - bounds.append((-0.25, 0.25)) - - # Positive segments - for octave in range(NUM_OCTAVES): - exp_val = EXP_BASE + octave - EXP_BIAS - base = 2.0**exp_val - for sub in range(num_subs): - lo = base * (1.0 + sub / num_subs) - hi = base * (1.0 + (sub + 1) / num_subs) - bounds.append((lo, hi)) - - # Negative segments (mirror of positive) - for octave in range(NUM_OCTAVES): - exp_val = EXP_BASE + octave - EXP_BIAS - base = 2.0**exp_val - for sub in range(num_subs): - lo = base * (1.0 + sub / num_subs) - hi = base * (1.0 + (sub + 1) / num_subs) 
- bounds.append((-hi, -lo)) - - return bounds - - -def _fit_coefficients(func_name, K, degree=2, num_samples=1000): - """Fit degree-N polynomials per segment. Returns (NUM_SEGS, degree+1) tensor.""" - ref_fn = REFERENCE_FUNCS[func_name] - bounds = _segment_boundaries(K) - num_segs = len(bounds) - coeffs = np.zeros((num_segs, degree + 1), dtype=np.float64) - - for seg, (lo, hi) in enumerate(bounds): - xs = np.linspace(lo, hi, num_samples, dtype=np.float64) - with torch.no_grad(): - ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64) - c = np.polynomial.polynomial.polyfit(xs, ys, deg=degree) - coeffs[seg] = c[: degree + 1] - - return torch.from_numpy(coeffs.astype(np.float32)) - - -def _segment_index(x, K, num_subs, num_segs): - """Map each element to its polynomial segment, mirroring SV addressing.""" - abs_x = x.abs() - is_neg = x < 0 - - is_near_zero = abs_x < 0.25 - is_clamp = abs_x >= 8.0 - is_neg_clamp = is_neg & is_clamp - is_pos_clamp = (~is_neg) & is_clamp - - safe_abs = abs_x.clamp(min=0.25) - floor_log2 = torch.floor(torch.log2(safe_abs)) - octave = (floor_log2 + 2).long().clamp(0, NUM_OCTAVES - 1) - - pow2 = torch.exp2(floor_log2) - frac = safe_abs / pow2 - 1.0 - sub = (frac * num_subs).long().clamp(0, num_subs - 1) - - pos_idx = 1 + octave * num_subs + sub - neg_idx = 1 + NUM_OCTAVES * num_subs + octave * num_subs + sub - - seg_idx = torch.where( - is_near_zero, - torch.zeros_like(pos_idx), - torch.where(is_neg, neg_idx, pos_idx), - ) - seg_idx = seg_idx.clamp(0, num_segs - 1) - - return seg_idx, is_neg_clamp, is_pos_clamp - - -class PWPolyFFunction(torch.autograd.Function): - """Emits a single PWPolyF ONNX node during export.""" - - @staticmethod - def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): - num_subs = 1 << K - num_segs = 1 + 2 * NUM_OCTAVES * num_subs - degree = coeffs.shape[1] - 1 - pos_passthrough = CLAMP_CFG[func]["pos_passthrough"] - - orig_shape = x.shape - x_flat = x.contiguous().view(-1) - - seg_idx, 
is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs) - - c = coeffs[seg_idx] - # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...)) - y = c[:, degree] - for i in range(degree - 1, -1, -1): - y = c[:, i] + x_flat * y - - if pos_passthrough: - pos_val = x_flat - else: - pos_val = pos_clamp_val.expand_as(y) - y = torch.where(is_pos_clamp, pos_val, y) - y = torch.where(is_neg_clamp, neg_clamp_val.expand_as(y), y) - - return y.view(orig_shape) - - @staticmethod - def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): - return g.op("PWPolyF", x, func_s=func, K_i=K) - - -class PiecewisePolyActivation(nn.Module): - """ - Drop-in activation matching the pwpolyf hardware behaviour. - - Approximates nonlinear activations using degree-2 polynomials over - segments defined by FP32 bit-extraction. Evaluated via Horner's method. - Emits a single PWPolyF custom op node during ONNX export. - """ - - def __init__(self, func="gelu", K=3, degree=2, fit_samples=1000): - super().__init__() - if func not in SUPPORTED_FUNCS: - raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS)) - - self.func = func - self.K = K - self.degree = degree - self.num_subs = 1 << K - self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs - self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"] - - coeffs = _fit_coefficients(func, K, degree=degree, num_samples=fit_samples) - self.register_buffer("coeffs", coeffs) - - neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32) - pos_cv = torch.tensor(CLAMP_CFG[func]["pos_clamp"], dtype=torch.float32) - self.register_buffer("neg_clamp_val", neg_cv) - self.register_buffer("pos_clamp_val", pos_cv) - - def forward(self, x): - if torch.onnx.is_in_onnx_export(): - return PWPolyFFunction.apply( - x, - self.coeffs, - self.neg_clamp_val, - self.pos_clamp_val, - self.func, - self.K, - ) - - orig_shape = x.shape - x_flat = x.contiguous().view(-1) - - seg_idx, is_neg_clamp, is_pos_clamp = 
_segment_index( - x_flat, self.K, self.num_subs, self.num_segs - ) - - c = self.coeffs[seg_idx] - # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...)) - y = c[:, self.degree] - for i in range(self.degree - 1, -1, -1): - y = c[:, i] + x_flat * y - - if self.pos_passthrough: - pos_val = x_flat - else: - pos_val = self.pos_clamp_val.expand_as(y) - y = torch.where(is_pos_clamp, pos_val, y) - y = torch.where(is_neg_clamp, self.neg_clamp_val.expand_as(y), y) - - return y.view(orig_shape) +from finn.util.torch_hw_modules import ( + CLAMP_CFG, + EXP_BASE, + EXP_BIAS, + EXP_CLAMP, + NUM_OCTAVES, + REFERENCE_FUNCS, + SUPPORTED_FUNCS, + PiecewisePolyActivation, + PWPolyFFunction, + _fit_coefficients, + _segment_boundaries, + _segment_index, +) + +__all__ = [ + "CLAMP_CFG", + "EXP_BIAS", + "EXP_BASE", + "EXP_CLAMP", + "NUM_OCTAVES", + "PWPolyFFunction", + "PiecewisePolyActivation", + "REFERENCE_FUNCS", + "SUPPORTED_FUNCS", + "_fit_coefficients", + "_segment_boundaries", + "_segment_index", +] diff --git a/src/finn/util/torch_hw_modules.py b/src/finn/util/torch_hw_modules.py new file mode 100644 index 0000000000..3fc560c182 --- /dev/null +++ b/src/finn/util/torch_hw_modules.py @@ -0,0 +1,236 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +PyTorch modules that match FINN hardware-layer behavior. + +These modules are intended as drop-in PyTorch layers for modelling the +functional behavior of FINN hardware layers before conversion to HWCustomOps. 
+""" + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +# Constants matching the SystemVerilog pwpolyf module +NUM_OCTAVES = 5 +EXP_BIAS = 127 +EXP_BASE = 125 +EXP_CLAMP = 130 + +SUPPORTED_FUNCS = ("gelu", "silu", "sigmoid", "tanh") + +REFERENCE_FUNCS = { + "gelu": lambda x: F.gelu(x), + "silu": lambda x: F.silu(x), + "sigmoid": lambda x: torch.sigmoid(x), + "tanh": lambda x: torch.tanh(x), +} + +CLAMP_CFG = { + "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, + "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True}, + "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False}, + "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False}, +} + + +def _segment_boundaries(K): + """Return (lo, hi) bounds for every PWPolyF segment.""" + num_subs = 1 << K + bounds = [] + + # Segment 0: near-zero + bounds.append((-0.25, 0.25)) + + # Positive segments + for octave in range(NUM_OCTAVES): + exp_val = EXP_BASE + octave - EXP_BIAS + base = 2.0**exp_val + for sub in range(num_subs): + lo = base * (1.0 + sub / num_subs) + hi = base * (1.0 + (sub + 1) / num_subs) + bounds.append((lo, hi)) + + # Negative segments (mirror of positive) + for octave in range(NUM_OCTAVES): + exp_val = EXP_BASE + octave - EXP_BIAS + base = 2.0**exp_val + for sub in range(num_subs): + lo = base * (1.0 + sub / num_subs) + hi = base * (1.0 + (sub + 1) / num_subs) + bounds.append((-hi, -lo)) + + return bounds + + +def _fit_coefficients(func_name, K, degree=2, num_samples=1000): + """Fit degree-N polynomials per segment. 
Returns a (segments, degree+1) tensor.""" + ref_fn = REFERENCE_FUNCS[func_name] + bounds = _segment_boundaries(K) + num_segs = len(bounds) + coeffs = np.zeros((num_segs, degree + 1), dtype=np.float64) + + for seg, (lo, hi) in enumerate(bounds): + xs = np.linspace(lo, hi, num_samples, dtype=np.float64) + with torch.no_grad(): + ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64) + c = np.polynomial.polynomial.polyfit(xs, ys, deg=degree) + coeffs[seg] = c[: degree + 1] + + return torch.from_numpy(coeffs.astype(np.float32)) + + +def _segment_index(x, K, num_subs, num_segs): + """Map each element to its polynomial segment, mirroring SV addressing.""" + abs_x = x.abs() + is_neg = x < 0 + + is_near_zero = abs_x < 0.25 + is_clamp = abs_x >= 8.0 + is_neg_clamp = is_neg & is_clamp + is_pos_clamp = (~is_neg) & is_clamp + + safe_abs = abs_x.clamp(min=0.25) + floor_log2 = torch.floor(torch.log2(safe_abs)) + octave = (floor_log2 + 2).long().clamp(0, NUM_OCTAVES - 1) + + pow2 = torch.exp2(floor_log2) + frac = safe_abs / pow2 - 1.0 + sub = (frac * num_subs).long().clamp(0, num_subs - 1) + + pos_idx = 1 + octave * num_subs + sub + neg_idx = 1 + NUM_OCTAVES * num_subs + octave * num_subs + sub + + seg_idx = torch.where( + is_near_zero, + torch.zeros_like(pos_idx), + torch.where(is_neg, neg_idx, pos_idx), + ) + seg_idx = seg_idx.clamp(0, num_segs - 1) + + return seg_idx, is_neg_clamp, is_pos_clamp + + +class PWPolyFFunction(torch.autograd.Function): + """Emit a single PWPolyF ONNX node during legacy torch.onnx export.""" + + @staticmethod + def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): + num_subs = 1 << K + num_segs = 1 + 2 * NUM_OCTAVES * num_subs + degree = coeffs.shape[1] - 1 + pos_passthrough = CLAMP_CFG[func]["pos_passthrough"] + + orig_shape = x.shape + x_flat = x.contiguous().view(-1) + + seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs) + + c = coeffs[seg_idx] + # Horner evaluation: y = c0 + x*(c1 + 
x*(c2 + ...)) + y = c[:, degree] + for i in range(degree - 1, -1, -1): + y = c[:, i] + x_flat * y + + if pos_passthrough: + pos_val = x_flat + else: + pos_val = pos_clamp_val.expand_as(y) + y = torch.where(is_pos_clamp, pos_val, y) + y = torch.where(is_neg_clamp, neg_clamp_val.expand_as(y), y) + + return y.view(orig_shape) + + @staticmethod + def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): + return g.op("PWPolyF", x, func_s=func, K_i=K) + + +class PiecewisePolyActivation(nn.Module): + """ + Drop-in activation matching FINN's PWPolyF RTL behavior. + + Approximates nonlinear activations using piecewise polynomials over + segments defined by FP32 bit extraction. The polynomial is evaluated via + Horner's method to match the DSPFP32 FMA chain used by the RTL. + """ + + def __init__(self, func="gelu", K=3, degree=2, fit_samples=1000): + super().__init__() + if func not in SUPPORTED_FUNCS: + raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS)) + + self.func = func + self.K = K + self.degree = degree + self.num_subs = 1 << K + self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs + self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"] + + coeffs = _fit_coefficients(func, K, degree=degree, num_samples=fit_samples) + self.register_buffer("coeffs", coeffs) + + neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32) + pos_cv = torch.tensor(CLAMP_CFG[func]["pos_clamp"], dtype=torch.float32) + self.register_buffer("neg_clamp_val", neg_cv) + self.register_buffer("pos_clamp_val", pos_cv) + + def forward(self, x): + if torch.onnx.is_in_onnx_export(): + return PWPolyFFunction.apply( + x, + self.coeffs, + self.neg_clamp_val, + self.pos_clamp_val, + self.func, + self.K, + ) + + orig_shape = x.shape + x_flat = x.contiguous().view(-1) + + seg_idx, is_neg_clamp, is_pos_clamp = _segment_index( + x_flat, self.K, self.num_subs, self.num_segs + ) + + c = self.coeffs[seg_idx] + # Horner evaluation: y = c0 + x*(c1 + x*(c2 + 
...)) + y = c[:, self.degree] + for i in range(self.degree - 1, -1, -1): + y = c[:, i] + x_flat * y + + if self.pos_passthrough: + pos_val = x_flat + else: + pos_val = self.pos_clamp_val.expand_as(y) + y = torch.where(is_pos_clamp, pos_val, y) + y = torch.where(is_neg_clamp, self.neg_clamp_val.expand_as(y), y) + + return y.view(orig_shape) diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py index 4a1b656631..f5d03ca82d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py +++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py @@ -50,7 +50,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.pwpolyf import PiecewisePolyActivation +from finn.util.torch_hw_modules import PiecewisePolyActivation test_fpga_part = "xcve2002-sbva484-2MP-e-S" non_versal_fpga_part = "xczu3eg-sbva484-1-e" From 59fc39870eed6afdf2b930d013f45e19bfbc23ab Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Mon, 18 May 2026 16:34:17 +0100 Subject: [PATCH 11/12] Address PWPolyF reviewer comments --- docs/finn/components/index.rst | 1 + docs/finn/components/pwpolyf.rst | 272 ++++++++++++++++++ docs/finn/pwpolyf.md | 176 ------------ src/finn/custom_op/fpgadataflow/pwpolyf.py | 49 +--- .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 48 +--- src/finn/util/pwpolyf.py | 29 +- src/finn/util/torch_hw_modules.py | 29 +- .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 48 +--- 8 files changed, 320 insertions(+), 332 deletions(-) create mode 100644 docs/finn/components/pwpolyf.rst delete mode 100644 docs/finn/pwpolyf.md diff --git a/docs/finn/components/index.rst b/docs/finn/components/index.rst index 9ab59297b1..7c8cdf1840 100644 --- a/docs/finn/components/index.rst +++ b/docs/finn/components/index.rst @@ -10,3 +10,4 @@ This section provides 
detailed documentation for specific FINN hardware componen :maxdepth: 2 rtl-swg + pwpolyf diff --git a/docs/finn/components/pwpolyf.rst b/docs/finn/components/pwpolyf.rst new file mode 100644 index 0000000000..0259f35450 --- /dev/null +++ b/docs/finn/components/pwpolyf.rst @@ -0,0 +1,272 @@ +PWPolyF Piecewise Polynomial Activation +======================================= + +Overview +-------- + +PWPolyF is a hardware activation layer that approximates nonlinear functions +(GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated with +Horner's method on a chain of DSPFP32 FMA units. With the default degree of 2, +this uses two cascaded DSPs and one RAMB18 coefficient ROM per PE, giving +single-cycle-per-element throughput. Per-function configuration, including +clamping behaviour and polynomial coefficients, is delivered through a +SystemVerilog package (``pwpolyf_pkg``) using a ``func_cfg_t`` struct. + +The input domain is partitioned into ``1 + 2*5*(2^K)`` segments: one near-zero +region, positive octave sub-segments, and negative mirrors. With the default +``K=3`` this gives 81 segments. Segment selection reuses the FP32 exponent and +mantissa bit fields directly, matching the RTL implementation. + +Polynomial coefficients are generated at HDL build time by +``PWPolyF_rtl._generate_coeffs_pkg()``, which fits polynomials of the +configured degree to the reference PyTorch functions and writes +``pwpolyf_pkg.sv``. Both ``K`` and ``degree`` are configurable. They default to +``K=3`` and ``degree=2`` when inferred from standard ONNX ops. + +Architecture +------------ + +PWPolyF is RTL-only, with no HLS variant, and targets Versal devices only. The +RTL instantiates the Versal DSPFP32 primitive, so UltraScale+ and older parts +must not be specialized to this backend. + +Two export paths are supported: + +.. code-block:: text + + Path A: PiecewisePolyActivation Path B: nn.GELU / nn.SiLU / etc. 
+ | torch.onnx.export | torch.onnx.export + | (dynamo=False) | (dynamo=True or False) + v v + PWPolyF custom ONNX node Standard ONNX ops (Gelu, Sigmoid, + | Tanh, Sigmoid+Mul for SiLU, + | Div+Erf+Add+Mul+Mul for GELU) + | | + +------------- both paths -------------+ + | + InferPWPolyFLayer + v + PWPolyF HW op (finn.custom_op.fpgadataflow) + | SpecializeLayers + v + PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl) + | generate_hdl + v + finn-rtllib/pwpolyf/hdl/ SystemVerilog IP + +Standard ONNX Op Inference +-------------------------- + +``InferPWPolyFLayer`` recognises standard ONNX activation ops in addition to +the explicit ``PWPolyF`` custom op. This allows models that use ``nn.GELU``, +``nn.SiLU``, ``nn.Sigmoid``, or ``nn.Tanh`` to be exported with ``dynamo=True`` +or ``dynamo=False`` and automatically converted to PWPolyF HW layers. + +.. list-table:: + :header-rows: 1 + :widths: 20 45 20 + + * - ONNX op type + - Pattern + - Maps to + * - ``Gelu`` (opset 20+) + - Single node + - ``func="gelu"`` + * - ``Div`` + ``Erf`` + ``Add`` + ``Mul`` + ``Mul`` + - ``x * 0.5 * (1 + erf(x / sqrt(2)))`` + - ``func="gelu"`` + * - ``Sigmoid`` + - Single node (standalone) + - ``func="sigmoid"`` + * - ``Tanh`` + - Single node + - ``func="tanh"`` + * - ``Sigmoid`` + ``Mul`` + - ``Mul(x, Sigmoid(x))`` + - ``func="silu"`` + +``Gelu`` as a single ONNX node requires opset 20 or later. With lower opsets, +including ``dynamo=True`` export defaults to opset 18, GELU decomposes into a +5-node Erf-based pattern. Both forms are matched. SiLU has no standard ONNX op +and decomposes to ``Sigmoid(x) * x``. Only FLOAT32 inputs are converted. + +Folding +------- + +PWPolyF uses PE parallelism. ``NumChannels % PE == 0`` must hold. Each PE +instantiates its own polynomial evaluation pipeline with ``degree`` DSPs. +``SetFolding`` handles PE selection automatically. + +.. 
list-table:: + :header-rows: 1 + :widths: 10 10 15 15 15 25 + + * - PE + - Degree + - DSPs + - BRAM18s + - Approx LUTs + - Cycles per spatial position + * - 1 + - 2 + - 2 + - 1 + - 200 + - NumChannels + * - C + - 2 + - 2C + - C + - 200C + - 1 + * - 1 + - 3 + - 3 + - 2 + - 300 + - NumChannels + +Resource Estimates +------------------ + +* DSP: ``degree * PE`` (one FP32 FMA stage per polynomial degree per PE) +* LUT: approximately ``100 * degree * PE`` for segment address decode and + control +* BRAM18: ``(degree - 1) * PE`` for default ``K=3``. Vivado infers delayed + coefficient lookups as 32-bit ROMs. +* URAM: 0 + +ONNX Export +----------- + +Two export paths are supported: + +* ``PiecewisePolyActivation`` exports as a single ``PWPolyF`` custom op via + ``torch.autograd.Function.symbolic()``. It requires ``dynamo=False`` and + preserves the ``K`` attribute on the ONNX node. +* Standard PyTorch modules (``nn.GELU``, ``nn.SiLU``, ``nn.Sigmoid``, + ``nn.Tanh``) export with ``dynamo=True`` or ``dynamo=False`` and produce + standard ONNX ops that ``InferPWPolyFLayer`` converts to PWPolyF with + default ``K=3``. + +Attributes on the explicit PWPolyF ONNX node are: + +* ``func``: one of ``gelu``, ``silu``, ``sigmoid``, ``tanh`` +* ``K``: mantissa subdivision bits, default 3 + +Node Attributes +--------------- + +.. list-table:: + :header-rows: 1 + :widths: 25 15 45 + + * - Attribute + - Type + - Description + * - ``func`` + - string + - Activation function name + * - ``K`` + - int + - Mantissa subdivision bits, default 3 + * - ``degree`` + - int + - Polynomial degree / FMA stages, default 2 + * - ``NumChannels`` + - int + - Number of channels in the last input dimension + * - ``PE`` + - int + - Processing elements + * - ``inputDataType`` + - string + - Input data type, always FLOAT32 + * - ``outputDataType`` + - string + - Output data type, always FLOAT32 + * - ``numInputVectors`` + - ints + - Batch/spatial dimensions + +Supported Functions +------------------- + +.. 
list-table:: + :header-rows: 1 + :widths: 20 20 30 + + * - Function + - Negative clamp + - Positive behaviour + * - GELU + - 0.0 + - passthrough (``y=x``) + * - SiLU + - 0.0 + - passthrough (``y=x``) + * - Sigmoid + - 0.0 + - clamp to 1.0 + * - Tanh + - -1.0 + - clamp to 1.0 + +Files +----- + +Python files: + +.. list-table:: + :header-rows: 1 + :widths: 35 50 + + * - File + - Purpose + * - ``util/torch_hw_modules.py`` + - PyTorch activation module, ONNX export, software simulation + * - ``custom_op/fpgadataflow/pwpolyf.py`` + - Base HW op for shape, folding, resource estimates, cppsim + * - ``custom_op/fpgadataflow/rtl/pwpolyf_rtl.py`` + - RTL backend for HDL generation, package generation, rtlsim, IPI + * - ``util/pwpolyf.py`` + - Compatibility imports for existing PWPolyF utility users + * - ``transformation/fpgadataflow/convert_to_hw_layers.py`` + - ``InferPWPolyFLayer`` transformation + * - ``builder/build_dataflow_steps.py`` + - Build pipeline integration + * - ``transformation/fpgadataflow/set_folding.py`` + - Folding support + +RTL files: + +.. 
list-table:: + :header-rows: 1 + :widths: 35 50 + + * - File + - Purpose + * - ``finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv`` + - ``func_cfg_t`` struct per activation, regenerated per K + * - ``finn-rtllib/pwpolyf/hdl/pwpolyf.sv`` + - Polynomial evaluation pipeline using a Horner chain on DSPFP32 + * - ``finn-rtllib/pwpolyf/hdl/queue.sv`` + - Elastic FIFO for backpressure + * - ``finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v`` + - AXI-Stream wrapper template + +Tests +----- + +``tests/fpgadataflow/test_fpgadataflow_pwpolyf.py`` covers: + +* cppsim for all supported functions, channel counts, spatial shapes, and + foldings +* ONNX export for the explicit ``PiecewisePolyActivation`` path +* ``InferPWPolyFLayer`` conversion and execution +* standard op inference for Gelu, Sigmoid, Tanh, SiLU, and Erf-based GELU +* execution correctness against ``PiecewisePolyActivation`` +* Versal-only specialization checks +* resource estimates, folded shapes, and expected cycles +* coefficient package generation for ``K`` and ``degree`` +* Vivado HDL generation, RTL simulation, and stitched IP simulation diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md deleted file mode 100644 index cf3aaeeeb5..0000000000 --- a/docs/finn/pwpolyf.md +++ /dev/null @@ -1,176 +0,0 @@ -# PWPolyF — Piecewise Polynomial Activation - -## Overview - -PWPolyF is a hardware activation layer that approximates nonlinear functions -(GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated via Horner's -method on a chain of DSPFP32 FMA units. With the default degree 2, this uses -two cascaded DSPs and one RAMB18 coefficient ROM per PE, giving -single-cycle-per-element throughput. Per-function configuration (clamping -behaviour and polynomial coefficients) is delivered through a SystemVerilog -package (`pwpolyf_pkg`) using a `func_cfg_t` struct. - -The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero -region, positive octave sub-segments, and negative mirrors. 
With the default -K=3 this gives 81 segments. Segment selection reuses the FP32 -exponent/mantissa bit-fields directly, matching the RTL implementation. - -Polynomial coefficients are generated at HDL build time by -`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits polynomials of the -configured degree to the reference PyTorch functions and writes -`pwpolyf_pkg.sv` — a SystemVerilog package with one `func_cfg_t` struct per -activation (clamping config + coefficient table). Both K and degree are -configurable; they default to K=3 and degree=2 when inferred from standard -ONNX ops. - -## Architecture - -PWPolyF is **RTL-only** (no HLS variant) and targets Versal devices only, -since the RTL instantiates the Versal DSPFP32 primitive. Two export paths are -supported: - -``` -Path A: PiecewisePolyActivation Path B: nn.GELU / nn.SiLU / etc. - | torch.onnx.export | torch.onnx.export - | (dynamo=False) | (dynamo=True or False) - v v -PWPolyF custom ONNX node Standard ONNX ops (Gelu, Sigmoid, - | Tanh, Sigmoid+Mul for SiLU, - | Div+Erf+Add+Mul+Mul for GELU) - | | - +------------- both paths -------------+ - | - InferPWPolyFLayer - v - PWPolyF HW op (finn.custom_op.fpgadataflow) - | SpecializeLayers - v - PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl) - | generate_hdl - v - finn-rtllib/pwpolyf/hdl/ SystemVerilog IP -``` - -### Standard ONNX op inference - -`InferPWPolyFLayer` recognises standard ONNX activation ops in addition to -the explicit `PWPolyF` custom op. This allows models that use `nn.GELU`, -`nn.SiLU`, `nn.Sigmoid`, or `nn.Tanh` to be exported with `dynamo=True` -(or `dynamo=False`) and automatically converted to PWPolyF HW layers. 
- -| ONNX op type | Pattern | Maps to | -|---|---|---| -| `Gelu` (opset 20+) | Single node | `func="gelu"` | -| `Div`+`Erf`+`Add`+`Mul`+`Mul` | `x * 0.5 * (1 + erf(x / sqrt(2)))` | `func="gelu"` | -| `Sigmoid` | Single node (standalone) | `func="sigmoid"` | -| `Tanh` | Single node | `func="tanh"` | -| `Sigmoid` + `Mul` | `Mul(x, Sigmoid(x))` | `func="silu"` | - -Notes: -- `Gelu` as a single ONNX node requires opset 20 or later. With lower - opsets (including `dynamo=True` which defaults to opset 18), GELU - decomposes into a 5-node Erf-based pattern. Both forms are matched. -- SiLU (`nn.SiLU`) has no standard ONNX op; it decomposes to - `Sigmoid(x) * x`. The transformation detects this two-node pattern. -- Only FLOAT32 inputs are converted. Quantised activations are skipped. - -## Folding - -PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold. -Each PE instantiates its own polynomial evaluation pipeline (`degree` DSPs). -`SetFolding` handles PE selection automatically. - -| PE | Degree | DSPs | BRAM18s | Approx LUTs | Cycles (per spatial position) | -|----|--------|------------|-------------------|------------------|-------------------------------| -| 1 | 2 | 2 | 1 | 200 | NumChannels | -| C | 2 | 2C | C | 200C | 1 | -| 1 | 3 | 3 | 2 | 300 | NumChannels | - -## Resource estimates - -- **DSP:** `degree * PE` (one FP32 FMA stage per polynomial degree per PE) -- **LUT:** `~100 * degree * PE` (segment address decode + control) -- **BRAM18:** `(degree - 1) * PE` for default `K=3` (Vivado infers delayed - coefficient lookups as 32-bit ROMs) -- **URAM:** 0 - -## ONNX export - -Two export paths are supported: - -1. **`PiecewisePolyActivation` (explicit)** — exports as a single `PWPolyF` - custom op via `torch.autograd.Function.symbolic()`. Requires - `dynamo=False`. Preserves the `K` attribute on the ONNX node. - -2. **Standard nn modules** (`nn.GELU`, `nn.SiLU`, `nn.Sigmoid`, `nn.Tanh`) — - export with `dynamo=True` or `dynamo=False`. 
Produces standard ONNX ops - that `InferPWPolyFLayer` converts to PWPolyF with default `K=3`. - -Attributes on the explicit PWPolyF ONNX node: -- `func` (string): one of `gelu`, `silu`, `sigmoid`, `tanh` -- `K` (int): mantissa subdivision bits (default 3) - -## Node attributes (HW op) - -| Attribute | Type | Description | -|--------------------|--------|------------------------------------------| -| `func` | string | Activation function name | -| `K` | int | Mantissa subdivision bits (default 3) | -| `degree` | int | Polynomial degree / FMA stages (default 2) | -| `NumChannels` | int | Number of channels (last input dim) | -| `PE` | int | Processing elements | -| `inputDataType` | string | Input data type (FLOAT32) | -| `outputDataType` | string | Output data type (FLOAT32) | -| `numInputVectors` | ints | Batch/spatial dimensions | - -## Supported functions - -| Function | Negative clamp | Positive behaviour | -|----------|---------------|--------------------| -| GELU | 0.0 | passthrough (y=x) | -| SiLU | 0.0 | passthrough (y=x) | -| Sigmoid | 0.0 | clamp to 1.0 | -| Tanh | -1.0 | clamp to 1.0 | - -## Files - -### Python - -| File | Purpose | -|------|---------| -| `util/torch_hw_modules.py` | PyTorch activation module, ONNX export, software simulation | -| `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) | -| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, package generation, rtlsim, IPI) | -| `util/pwpolyf.py` | Compatibility imports for existing PWPolyF utility users | -| `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation | -| `builder/build_dataflow_steps.py` | Build pipeline integration | -| `transformation/fpgadataflow/set_folding.py` | Folding support (pe_ops list) | - -### RTL - -| File | Purpose | -|------|---------| -| `finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv` | `func_cfg_t` struct per activation (coeffs + clamp config, regenerated per K) | -| 
`finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Polynomial evaluation pipeline (Horner chain on DSPFP32) | -| `finn-rtllib/pwpolyf/hdl/queue.sv` | Elastic FIFO for backpressure | -| `finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v` | AXI-Stream wrapper template | - -## Tests - -`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py`: - -- **cppsim**: all 4 functions x 2 channel counts x 2 spatial shapes x 3 foldings -- **ONNX export**: verifies single-node export for all functions -- **InferPWPolyFLayer**: end-to-end export → transform → execute -- **Standard op inference**: Gelu/Sigmoid/Tanh single-node + SiLU pattern -- **Erf-based GELU inference**: 5-node Erf decomposition pattern matching + execution -- **SiLU edge cases**: reversed Mul input order, multi-consumer Sigmoid -- **Execution correctness**: standard ops produce same output as PiecewisePolyActivation -- **SpecializeLayers**: verifies RTL specialization -- **Resource estimates**: DSP/LUT/BRAM checks across PE and degree values -- **Folded shapes**: input/output/stream width calculations -- **Expected cycles**: cycle count estimation + analysis pass integration -- **Coefficient package**: `generate_coeffs_pkg()` output validation for K and degree -- **HDL generation** (Vivado): verifies `generate_hdl` produces correct files and package content -- **RTL simulation** (Vivado, slow): node-by-node rtlsim with cycle count verification -- **Stitched IP** (Vivado, slow): end-to-end stitched IP rtlsim diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py index 746ebdeb38..b7a683499b 100644 --- a/src/finn/custom_op/fpgadataflow/pwpolyf.py +++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py @@ -1,30 +1,5 @@ -# Copyright (C) 2026, Advanced Micro Devices, Inc. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright Advanced Micro Devices, Inc. +# SPDX-License-Identifier: BSD-3-Clause import math import numpy as np @@ -32,7 +7,8 @@ from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -# Piecewise polynomial constants matching the RTL module +# NUM_OCTAVES is fixed by the RTL segment decode and clamp range. K controls +# the number of mantissa subdivisions inside each of these fixed octaves. 
_NUM_OCTAVES = 5 _SUPPORTED_FUNCS = {"gelu", "silu", "sigmoid", "tanh"} @@ -82,10 +58,13 @@ def make_shape_compatible_op(self, model): def infer_node_datatype(self, model): node = self.onnx_node idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - self.set_nodeattr("inputDataType", idt.name) - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) + assert idt == DataType["FLOAT32"], "%s: PWPolyF requires FLOAT32 input, got %s" % ( + node.name, + idt, + ) + self.set_nodeattr("inputDataType", idt.name) + self.set_nodeattr("outputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) def verify_node(self): info_messages = [] @@ -114,6 +93,9 @@ def verify_node(self): idt = self.get_nodeattr("inputDataType") if idt != "FLOAT32": info_messages.append("PWPolyF requires FLOAT32 input, got %s" % idt) + odt = self.get_nodeattr("outputDataType") + if odt != "FLOAT32": + info_messages.append("PWPolyF requires FLOAT32 output, got %s" % odt) return info_messages @@ -149,9 +131,6 @@ def get_normal_input_shape(self, ind=0): def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - def get_exp_cycles(self): # II=1, latency amortised over stream length return np.prod(self.get_folded_output_shape()[:-1]) diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py index 3411c81a8b..5dfa730bc1 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py @@ -1,37 +1,14 @@ -# Copyright (C) 2026, Advanced Micro Devices, Inc. -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: BSD-3-Clause +import numpy as np import os import shutil -import struct +from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.util.data_packing import array2hexstring from finn.util.torch_hw_modules import ( CLAMP_CFG, NUM_OCTAVES, @@ -42,10 +19,10 @@ def _float_to_hex(f): """Convert a Python float to a 32-bit IEEE 754 hex string.""" - return "%08X" % struct.unpack("!I", struct.pack("!f", float(f)))[0] + return array2hexstring(np.array([f]), DataType["FLOAT32"], 32, prefix="").upper() -def generate_coeffs_pkg(K, degree=2, num_samples=1000): +def _generate_coeffs_pkg_data(K, degree=2, num_samples=1000): """Generate the pwpolyf_pkg.sv package content for a given K value. Produces a SystemVerilog package with a func_cfg_t struct per activation @@ -55,7 +32,7 @@ def generate_coeffs_pkg(K, degree=2, num_samples=1000): num_segs = 1 + 2 * NUM_OCTAVES * num_subs lines = [] - lines.append("// Auto-generated by pwpolyf_rtl.py — do not edit manually.") + lines.append("// Auto-generated by pwpolyf_rtl.py - do not edit manually.") lines.append( "// DEGREE=%d K=%d NUM_OCTAVES=%d Segments: %d" % (degree, K, NUM_OCTAVES, num_segs) ) @@ -115,6 +92,11 @@ def get_nodeattr_types(self): my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs + def _generate_coeffs_pkg(self, num_samples=1000): + K = self.get_nodeattr("K") + degree = self.get_nodeattr("degree") + return _generate_coeffs_pkg_data(K, degree=degree, num_samples=num_samples) + def generate_hdl(self, model, fpgapart, clk): rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/") template_path = rtllib_dir + "pwpolyf_template_wrapper.v" @@ -148,9 +130,7 @@ def generate_hdl(self, model, fpgapart, clk): shutil.copy(rtllib_dir + sv_file, code_gen_dir) # generate package with coefficients matching the node's K and degree - K = self.get_nodeattr("K") - 
degree = self.get_nodeattr("degree") - pkg_data = generate_coeffs_pkg(K, degree=degree) + pkg_data = self._generate_coeffs_pkg() with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "w") as f: f.write(pkg_data) diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py index 1972b0248a..0c426db05b 100644 --- a/src/finn/util/pwpolyf.py +++ b/src/finn/util/pwpolyf.py @@ -1,30 +1,5 @@ -# Copyright (C) 2026, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: BSD-3-Clause """Compatibility imports for PWPolyF PyTorch utilities. diff --git a/src/finn/util/torch_hw_modules.py b/src/finn/util/torch_hw_modules.py index 3fc560c182..b12ed05809 100644 --- a/src/finn/util/torch_hw_modules.py +++ b/src/finn/util/torch_hw_modules.py @@ -1,30 +1,5 @@ -# Copyright (C) 2026, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: BSD-3-Clause """ PyTorch modules that match FINN hardware-layer behavior. diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py index f5d03ca82d..4942b6e7ed 100644 --- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py +++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py @@ -1,30 +1,5 @@ -# Copyright (C) 2026, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# Copyright Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: BSD-3-Clause import pytest @@ -41,7 +16,6 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import generate_coeffs_pkg from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -52,7 +26,7 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.torch_hw_modules import PiecewisePolyActivation -test_fpga_part = "xcve2002-sbva484-2MP-e-S" +test_fpga_part = "xcvc1902-vsva2197-2MP-e-S" non_versal_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -91,6 +65,14 @@ def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs): return model +def make_pwpolyf_rtl_inst(K=3, degree=2): + model = make_pwpolyf_modelwrapper("gelu", K, 4, [1]) + model = model.transform(SpecializeLayers(test_fpga_part)) + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("degree", degree) + return inst + + @pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"]) @pytest.mark.parametrize("num_channels", [4, 16]) @pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]]) @@ -590,8 +572,8 @@ def test_pwpolyf_erf_gelu_execution(): @pytest.mark.parametrize("K", [2, 3, 4]) @pytest.mark.fpgadataflow def test_pwpolyf_generate_coeffs_pkg(K): - """Verify generate_coeffs_pkg produces valid SystemVerilog package.""" - pkg = generate_coeffs_pkg(K) + """Verify PWPolyF_rtl coefficient generation produces valid SystemVerilog.""" + pkg = make_pwpolyf_rtl_inst(K=K)._generate_coeffs_pkg() assert "package pwpolyf_pkg" in pkg assert "endpackage" in pkg @@ -613,9 +595,9 @@ def test_pwpolyf_generate_coeffs_pkg(K): @pytest.mark.parametrize("degree", [1, 2, 3]) @pytest.mark.fpgadataflow def 
test_pwpolyf_generate_coeffs_pkg_degree(degree): - """Verify generate_coeffs_pkg respects degree parameter.""" + """Verify PWPolyF_rtl coefficient generation respects degree parameter.""" K = 3 - pkg = generate_coeffs_pkg(K, degree=degree) + pkg = make_pwpolyf_rtl_inst(K=K, degree=degree)._generate_coeffs_pkg() assert "DEGREE = %d;" % degree in pkg # Each segment line should have degree+1 coefficient values From f3156c4edfd84e37b6680cb2b30e71a174874585 Mon Sep 17 00:00:00 2001 From: ollycassidy13 Date: Wed, 20 May 2026 12:28:58 +0100 Subject: [PATCH 12/12] export to match brevitas --- .../fpgadataflow/convert_to_hw_layers.py | 3 +++ src/finn/util/torch_hw_modules.py | 9 +++++---- tests/fpgadataflow/test_fpgadataflow_pwpolyf.py | 10 +++++++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index dc09b3daeb..185dc73e06 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -353,6 +353,8 @@ def apply(self, model): func = get_by_name(node.attribute, "func").s.decode("utf-8") K_attr = get_by_name(node.attribute, "K") K = K_attr.i if K_attr is not None else 3 + degree_attr = get_by_name(node.attribute, "degree") + degree = degree_attr.i if degree_attr is not None else 2 new_node = self._make_pwpolyf_node( pwp_input, @@ -362,6 +364,7 @@ def apply(self, model): idt, "PWPolyF_" + node.name, K, + degree, ) graph.node.insert(node_ind, new_node) graph.node.remove(node) diff --git a/src/finn/util/torch_hw_modules.py b/src/finn/util/torch_hw_modules.py index b12ed05809..d73ae16f0c 100644 --- a/src/finn/util/torch_hw_modules.py +++ b/src/finn/util/torch_hw_modules.py @@ -117,10 +117,10 @@ class PWPolyFFunction(torch.autograd.Function): """Emit a single PWPolyF ONNX node during legacy torch.onnx export.""" @staticmethod - def forward(ctx, x, coeffs, 
neg_clamp_val, pos_clamp_val, func, K): + def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K, degree): num_subs = 1 << K num_segs = 1 + 2 * NUM_OCTAVES * num_subs - degree = coeffs.shape[1] - 1 + degree = int(degree) pos_passthrough = CLAMP_CFG[func]["pos_passthrough"] orig_shape = x.shape @@ -144,8 +144,8 @@ def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): return y.view(orig_shape) @staticmethod - def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K): - return g.op("PWPolyF", x, func_s=func, K_i=K) + def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K, degree): + return g.op("PWPolyF", x, func_s=func, K_i=K, degree_i=degree) class PiecewisePolyActivation(nn.Module): @@ -186,6 +186,7 @@ def forward(self, x): self.pos_clamp_val, self.func, self.K, + self.degree, ) orig_shape = x.shape diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py index 4942b6e7ed..b9de975778 100644 --- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py +++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py @@ -109,8 +109,9 @@ def test_pwpolyf_cppsim(func, num_channels, num_input_vecs, fold): @pytest.mark.fpgadataflow def test_pwpolyf_onnx_export(func): K = 3 + degree = 3 num_channels = 32 - mod = PiecewisePolyActivation(func, K=K) + mod = PiecewisePolyActivation(func, K=K, degree=degree) mod.eval() dummy = torch.randn(1, num_channels) @@ -138,14 +139,16 @@ def test_pwpolyf_onnx_export(func): func_attr = {a.name: a for a in node.attribute} assert func_attr["func"].s.decode("utf-8") == func assert func_attr["K"].i == K + assert func_attr["degree"].i == degree @pytest.mark.parametrize("func", ["gelu", "sigmoid"]) @pytest.mark.fpgadataflow def test_pwpolyf_infer_transform(func): K = 3 + degree = 3 num_channels = 16 - mod = PiecewisePolyActivation(func, K=K) + mod = PiecewisePolyActivation(func, K=K, degree=degree) mod.eval() dummy = torch.randn(1, num_channels) @@ -178,6 +181,7 @@ 
def test_pwpolyf_infer_transform(func): inst = getCustomOp(node) assert inst.get_nodeattr("func") == func assert inst.get_nodeattr("K") == K + assert inst.get_nodeattr("degree") == degree assert inst.get_nodeattr("NumChannels") == num_channels assert inst.get_nodeattr("PE") == 1 assert inst.get_nodeattr("inputDataType") == "FLOAT32" @@ -186,7 +190,7 @@ def test_pwpolyf_infer_transform(func): input_dict = {"inp": x} y_produced = oxe.execute_onnx(model, input_dict)["outp"] - ref_mod = PiecewisePolyActivation(func, K=K) + ref_mod = PiecewisePolyActivation(func, K=K, degree=degree) with torch.no_grad(): y_expected = ref_mod(torch.from_numpy(x)).numpy() assert np.allclose(y_produced, y_expected, atol=1e-6)