From 1d1399046d0cf61b00082352e75f814d0ff15cf3 Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Wed, 22 Apr 2026 11:43:10 +0100
Subject: [PATCH 01/12] pwpolyf initial integration (missing dynamo and nn.act)
 (hw stub)

---
 docs/finn/internals.rst                       |   3 +
 docs/finn/pwpolyf.md                          | 123 ++++++
 .../finn.custom_op.fpgadataflow.rst           |   9 +
 .../finn.custom_op.fpgadataflow.rtl.rst       |   8 +
 docs/finn/source_code/finn.util.rst           |   8 +
 finn-rtllib/pwpolyf/hdl/pwpolyf.sv            | 356 ++++++++++++++++++
 finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh    | 344 +++++++++++++++++
 .../pwpolyf/hdl/pwpolyf_template_wrapper.v    |  69 ++++
 finn-rtllib/pwpolyf/hdl/queue.sv              |  78 ++++
 src/finn/builder/build_dataflow_steps.py      |   2 +
 src/finn/custom_op/fpgadataflow/__init__.py   |   2 +
 src/finn/custom_op/fpgadataflow/pwpolyf.py    | 187 +++++++++
 .../custom_op/fpgadataflow/rtl/__init__.py    |   2 +
 .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 289 ++++++++++++++
 .../fpgadataflow/convert_to_hw_layers.py      |  47 +++
 .../fpgadataflow/set_folding.py               |   1 +
 src/finn/util/pwpolyf.py                      | 236 ++++++++++++
 .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 281 ++++++++++++++
 18 files changed, 2045 insertions(+)
 create mode 100644 docs/finn/pwpolyf.md
 create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf.sv
 create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh
 create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
 create mode 100755 finn-rtllib/pwpolyf/hdl/queue.sv
 create mode 100644 src/finn/custom_op/fpgadataflow/pwpolyf.py
 create mode 100644 src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
 create mode 100644 src/finn/util/pwpolyf.py
 create mode 100644 tests/fpgadataflow/test_fpgadataflow_pwpolyf.py

diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index 0fd6c42350..438e64b077 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -247,6 +247,9 @@ Constraints to folding factors per layer
    * - Pool
      - PE
      - inp_channels % PE == 0
+   * - PWPolyF
+     - PE
+     - NumChannels % PE == 0
    * - Thresholding
      - PE
      - MH % PE == 0
diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md
new file mode 100644
index 0000000000..c155470bae
--- /dev/null
+++ b/docs/finn/pwpolyf.md
@@ -0,0 +1,123 @@
+# PWPolyF — Piecewise Polynomial Activation
+
+## Overview
+
+PWPolyF is a hardware activation layer that approximates nonlinear functions
+(GELU, SiLU, Sigmoid, Tanh) using degree-2 piecewise polynomials. Each segment
+is evaluated via Horner's method on two cascaded DSPFP32 FMA units, giving each PE
+a throughput of one element per cycle (8-cycle latency) with no BRAM usage.
+
+The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero
+region (|x| < 0.25) plus, for each sign, 5 octaves covering 0.25 <= |x| < 8 that
+are each split into `2^K` sub-segments; |x| >= 8 is clamped. The default K=3 gives
+81 segments. Segments are selected from the FP32 exponent/mantissa bits, as in the RTL.
+
+Polynomial coefficients are generated at HDL build time by
+`generate_coeffs_svh()` in `pwpolyf_sim.py`, which fits degree-2 polynomials
+to the reference PyTorch functions and writes the `pwpolyf_coeffs.svh` header.
+This ensures the RTL coefficients always match the configured K value.
+
+> **Note:** The RTL currently only supports K=3. Support for other K values
+> is planned for a future update to `pwpolyf.sv`.
+
+## Architecture
+
+PWPolyF is **RTL-only** (no HLS variant). The pipeline is:
+
+```
+PiecewisePolyActivation (PyTorch)
+    |  torch.onnx.export (dynamo=False)
+    v
+PWPolyF ONNX node
+    |  InferPWPolyFLayer
+    v
+PWPolyF HW op (finn.custom_op.fpgadataflow)
+    |  SpecializeLayers
+    v
+PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl)
+    |  generate_hdl
+    v
+finn-rtllib/pwpolyf/hdl/ SystemVerilog IP
+```
+
+## Folding
+
+PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold.
+Each PE instantiates its own polynomial evaluation pipeline (2 DSPs).
+`SetFolding` handles PE selection automatically.
+
+| PE | DSPs | Approx LUTs | Cycles (per spatial position) |
+|----|------|-------------|-------------------------------|
+| 1  | 2    | 200         | NumChannels                   |
+| C  | 2C   | 200C        | 1                             |
+
+## Resource estimates
+
+- **DSP:** 2 per PE (two FP32 FMA stages)
+- **LUT:** ~200 per PE (segment address decode + control)
+- **BRAM/URAM:** 0 (coefficients stored in LUT/registers)
+
+## ONNX export
+
+`PiecewisePolyActivation` exports as a single `PWPolyF` custom op via
+`torch.autograd.Function.symbolic()`. This requires the legacy TorchScript
+exporter (`dynamo=False` in `torch.onnx.export`).
+
+Attributes on the ONNX node:
+- `func` (string): one of `gelu`, `silu`, `sigmoid`, `tanh`
+- `K` (int): mantissa subdivision bits (default 3)
+
+## Node attributes (HW op)
+
+| Attribute          | Type   | Description                              |
+|--------------------|--------|------------------------------------------|
+| `func`             | string | Activation function name                 |
+| `K`                | int    | Mantissa subdivision bits                |
+| `NumChannels`      | int    | Number of channels (last input dim)      |
+| `PE`               | int    | Processing elements                      |
+| `inputDataType`    | string | Input data type (FLOAT32)                |
+| `outputDataType`   | string | Output data type (FLOAT32)               |
+| `numInputVectors`  | ints   | Batch/spatial dimensions                 |
+
+## Supported functions
+
+| Function | Output for x <= -8 | Output for x >= 8 |
+|----------|---------------|--------------------|
+| GELU     | 0.0           | passthrough (y=x)  |
+| SiLU     | 0.0           | passthrough (y=x)  |
+| Sigmoid  | 0.0           | clamp to 1.0       |
+| Tanh     | -1.0          | clamp to 1.0       |
+
+## Files
+
+### Python
+
+| File | Purpose |
+|------|---------|
+| `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) |
+| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, coefficient SVH generation, rtlsim, IPI) |
+| `util/pwpolyf.py` | PyTorch activation module, ONNX export, software simulation |
+| `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation |
+| `builder/build_dataflow_steps.py` | Build pipeline integration |
+| `transformation/fpgadataflow/set_folding.py` | Folding support (pe_ops list) |
+
+### RTL
+
+| File | Purpose |
+|------|---------|
+| `finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Core polynomial evaluation pipeline |
+| `finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh` | Default K=3 coefficients (regenerated at build time) |
+| `finn-rtllib/pwpolyf/hdl/queue.sv` | Elastic FIFO for backpressure |
+| `finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v` | AXI-Stream wrapper template |
+
+## Tests
+
+`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py` — 68 parametrized tests:
+
+- **cppsim**: all 4 functions x 2 channel counts x 2 spatial shapes x 3 foldings
+- **ONNX export**: verifies single-node export for all functions
+- **InferPWPolyFLayer**: end-to-end export → transform → execute
+- **SpecializeLayers**: verifies RTL specialization
+- **Resource estimates**: DSP/LUT/BRAM checks across PE values
+- **Folded shapes**: input/output/stream width calculations
+- **Expected cycles**: cycle count estimation + analysis pass integration
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index 25aafc324e..7660ea6dd3 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -144,6 +144,15 @@ finn.custom\_op.fpgadataflow.pool
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.pwpolyf
+--------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.pwpolyf
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.custom\_op.fpgadataflow.streamingdataflowpartition
 --------------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
index 346eddb073..e31176462f 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
@@ -45,6 +45,14 @@ finn.custom\_op.fpgadataflow.streamingfifo\_rtl
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.pwpolyf\_rtl
+--------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.fpgadataflow.thresholding\_rtl
 -------------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index 2ec1502441..fb9b8ddfff 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -171,6 +171,14 @@ finn.util.pytorch
  :show-inheritance:
 
 
+finn.util.pwpolyf
+-------------------
+
+.. automodule:: finn.util.pwpolyf
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.util.pyverilator
 ---------------------
 
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
new file mode 100644
index 0000000000..51196a9db6
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
@@ -0,0 +1,356 @@
+/****************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	FP32 piecewise polynomial activation on DSPFP32.
+ * @author	Shane Fleming <shane.fleming@amd.com>
+ *
+ * @description
+ *	Supports GELU, SiLU, Sigmoid, and Tanh via `parameter string FUNC`.
+ *
+ *	Approximated by piecewise degree-2 polynomials over segments defined
+ *	by FP32 bit-extraction.  Evaluated via Horner's method on a chain of
+ *	2 DSPFP32 instances, each computing FMA: out = C + A*B.
+ *
+ *	Horner: y = a_0 + x*(a_1 + a_2*x)
+ *	  Stage 0: out = a_1 + a_2 * x        (A=coeff[2], B=x, C=coeff[1])
+ *	  Stage 1: out = a_0 + prev * x       (A=prev,     B=x, C=coeff[0])
+ *
+ *	Clamping for |x| >= 8 (beyond the 5 fitted octaves 0.25 <= |x| < 8):
+ *	  GELU/SiLU:  neg -> 0,   pos -> x  (pass-through)
+ *	  Sigmoid:    neg -> 0,   pos -> 1.0
+ *	  Tanh:       neg -> -1,  pos -> 1.0
+ *
+ *	Latency: 8 cycles (2 DSP stages x 4 cycles each).  II=1.
+ ***************************************************************************/
+
+//===----------------------------------------------------------------------===//
+// Single DSPFP32 FMA wrapper: r = c + a * b
+//===----------------------------------------------------------------------===//
+module pwpolyf_dspfp32 (	// one fused multiply-add, r = c + a * b, on a single DSPFP32 primitive
+	input  logic         clk,
+	input  logic         rst,	// only gates the simulation checks below; the primitive's resets are tied off
+
+	input  logic [31:0]  a,	// FP32 multiplicand, split into sign/exponent/mantissa at the primitive ports
+	input  logic [31:0]  b,	// FP32 multiplier, split the same way
+	input  logic [31:0]  c,	// FP32 addend, passed to the C port as one 32-bit word
+
+	output logic [31:0]  r,	// FP32 result, taken from FPA_OUT
+	input  logic         rvld	// marks cycles in which r is meaningful; used by the simulation checks only
+);
+
+	// FMA opmode: FPA_OUT = C + A*B
+	//  FPOPMODE[6:5] = 00 (no sign flip on C or M)
+	//  FPOPMODE[4:2] = 110 (select C for W mux, M for Z mux -- add path)
+	//  FPOPMODE[1:0] = 01 (FP mode enable)
+	localparam logic [6:0]  MODE_FMA = 7'b00_110_01;
+
+	logic  invalid;	// driven by FPA_INVALID
+	logic  overflow;	// driven by FPA_OVERFLOW
+	logic  underflow;	// driven by FPA_UNDERFLOW
+
+	DSPFP32 #(
+		.A_FPTYPE("B32"),
+		.A_INPUT("DIRECT"),
+		.BCASCSEL("B"),
+		.B_D_FPTYPE("B32"),
+		.B_INPUT("DIRECT"),
+		.PCOUTSEL("FPA"),
+		.USE_MULT("MULTIPLY"),
+		.IS_CLK_INVERTED(1'b0),
+		.IS_FPINMODE_INVERTED(1'b0),
+		.IS_FPOPMODE_INVERTED(7'b0000000),
+		.IS_RSTA_INVERTED(1'b0),
+		.IS_RSTB_INVERTED(1'b0),
+		.IS_RSTC_INVERTED(1'b0),
+		.IS_RSTD_INVERTED(1'b0),
+		.IS_RSTFPA_INVERTED(1'b0),
+		.IS_RSTFPINMODE_INVERTED(1'b0),
+		.IS_RSTFPMPIPE_INVERTED(1'b0),
+		.IS_RSTFPM_INVERTED(1'b0),
+		.IS_RSTFPOPMODE_INVERTED(1'b0),
+		.ACASCREG(1),
+		.AREG(1),
+		.FPA_PREG(1),
+		.FPBREG(1),
+		.FPCREG(3),          // C needs 3 pipeline stages to align with M output
+		.FPDREG(0),
+		.FPMPIPEREG(1),
+		.FPM_PREG(1),
+		.FPOPMREG(0),
+		.INMODEREG(0),
+		.RESET_MODE("SYNC")
+	) DSPFP32_inst (
+		.ACOUT_EXP(), .ACOUT_MAN(), .ACOUT_SIGN(),
+		.BCOUT_EXP(), .BCOUT_MAN(), .BCOUT_SIGN(),
+		.PCOUT(),
+		.FPM_INVALID(), .FPM_OVERFLOW(), .FPM_UNDERFLOW(), .FPM_OUT(),
+		.FPA_INVALID(invalid), .FPA_OVERFLOW(overflow), .FPA_UNDERFLOW(underflow), .FPA_OUT(r),
+		.ACIN_EXP('x), .ACIN_MAN('x), .ACIN_SIGN('x),
+		.BCIN_EXP('x), .BCIN_MAN('x), .BCIN_SIGN('x),
+		.PCIN('x),
+		.CLK(clk),
+		.FPINMODE('1),       // Select B path (not D)
+		.FPOPMODE(MODE_FMA),
+		.A_SIGN(a[31]), .A_EXP(a[30:23]), .A_MAN(a[22:0]),
+		.B_SIGN(b[31]), .B_EXP(b[30:23]), .B_MAN(b[22:0]),
+		.C(c),
+		.D_SIGN('x), .D_EXP('x), .D_MAN('x),
+		.ASYNC_RST('0),
+		.CEA1('0), .CEA2('1),
+		.CEB('1), .CEC('1), .CED('0),
+		.CEFPA('1), .CEFPINMODE('0), .CEFPM('1), .CEFPMPIPE('1), .CEFPOPMODE('0),
+		.RSTA('0), .RSTB('0), .RSTC('0), .RSTD('0),	// every reset input is held inactive
+		.RSTFPA('0), .RSTFPINMODE('0), .RSTFPM('0), .RSTFPMPIPE('0), .RSTFPOPMODE('0)
+	);
+
+	// Simulation-time warnings: exception flags are checked only outside reset while rvld is set
+	always_ff @(posedge clk) begin
+		if(!rst && rvld) begin
+			assert(!invalid) else $warning("%m generated invalid output.");
+			assert(!overflow) else $warning("%m generated an overflow.");
+			assert(!underflow) else $warning("%m generated an underflow.");
+		end
+	end
+
+endmodule : pwpolyf_dspfp32
+
+//===----------------------------------------------------------------------===//
+// Full PE-wide streaming activation with piecewise polynomial approximation.
+// Hardcoded for DEGREE=2 from pwpolyf_coeffs.svh.
+//===----------------------------------------------------------------------===//
+module pwpolyf #(
+	int unsigned  PE = 1,	// number of FP32 lanes carried per stream beat
+	string  FUNC = "gelu"	// activation selector; must be gelu|silu|sigmoid|tanh (checked below)
+)(
+	// Global Control
+	input	logic  clk,
+	input	logic  rst,
+
+	// Input Stream - PE elements wide
+	input	logic [PE-1:0][31:0]  xdat,
+	input	logic  xvld,
+	output	logic  xrdy,
+
+	// Output Stream - PE elements wide
+	output	logic [PE-1:0][31:0]  ydat,
+	output	logic  yvld,
+	input	logic  yrdy
+);
+
+	`include "pwpolyf_coeffs.svh"
+
+	localparam int unsigned  K           = PWPOLYF_K;	// mantissa bits used to sub-divide an octave
+	localparam int unsigned  NUM_SEGS    = PWPOLYF_NUM_SEGS;	// rows in each coefficient table
+	localparam int unsigned  NUM_SUBS    = 1 << K;	// sub-segments per octave
+	localparam int unsigned  NUM_OCTAVES = PWPOLYF_NUM_OCTAVES;
+	localparam int unsigned  DSP_LAT     = 4;	// cycles through one pwpolyf_dspfp32 stage
+	localparam int unsigned  LATENCY     = 2 * DSP_LAT;  // DEGREE=2
+
+	initial begin	// parameter checks: abort on an unsupported configuration
+		assert(PWPOLYF_DEGREE == 2) else begin
+			$error("%m: This implementation requires PWPOLYF_DEGREE == 2.");
+			$finish;
+		end
+		assert(FUNC == "gelu" || FUNC == "silu" || FUNC == "sigmoid" || FUNC == "tanh") else begin
+			$error("%m: Unsupported FUNC=\"%s\". Must be gelu|silu|sigmoid|tanh.", FUNC);
+			$finish;
+		end
+	end
+
+	//=== Per-activation clamping parameters ==================================
+	localparam logic [31:0]  NEG_CLAMP_VAL =
+		FUNC == "tanh" ? 32'hBF800000 : 32'h00000000;  // tanh: -1.0, else: 0.0
+	localparam logic [31:0]  POS_CLAMP_VAL =
+		(FUNC == "sigmoid" || FUNC == "tanh") ? 32'h3F800000 : 32'h00000000;  // sigmoid/tanh: 1.0
+	localparam bit  POS_PASSTHROUGH =
+		(FUNC == "gelu" || FUNC == "silu") ? 1 : 0;  // gelu/silu: output=x
+
+	//=== Coefficient selection ===============================================
+	localparam logic [31:0]  COEFFS[NUM_SEGS][3] =	// per segment: [0]=a0, [1]=a1, [2]=a2
+		FUNC == "gelu"    ? PWPOLYF_GELU_COEFFS :
+		FUNC == "silu"    ? PWPOLYF_SILU_COEFFS :
+		FUNC == "sigmoid" ? PWPOLYF_SIGMOID_COEFFS :
+		                    PWPOLYF_TANH_COEFFS;
+
+	//=== Clamping exponent threshold =========================================
+	localparam int unsigned  EXP_CLAMP = 130;  // |x| >= 8.0
+
+	//=== Input Sidestep Register =============================================
+	typedef logic [PE-1:0][31:0]  fp_vec_t;
+
+	uwire  take;	// an input vector is issued into the compute pipeline this cycle
+
+	typedef struct {
+		fp_vec_t  val;
+		logic     rdy;
+	} ibuf_t;
+	ibuf_t  Ibuf = '{ val: 'x, rdy: '1 };	// rdy set <=> the sidestep register holds no pending vector
+	always_ff @(posedge clk) begin
+		if(rst)
+			Ibuf <= '{ val: 'x, rdy: '1 };
+		else begin
+			if(Ibuf.rdy)  Ibuf.val <= xdat;	// keep capturing the input while the register is free
+			Ibuf.rdy <= (Ibuf.rdy && !xvld) || take;	// stays free if nothing arrived, or is freed by an issue
+		end
+	end
+	assign	xrdy = Ibuf.rdy;
+	uwire fp_vec_t  x_cur = Ibuf.rdy? xdat : Ibuf.val;	// bypass the register when it is empty
+
+	//=== Credit-based Operation Issue ========================================
+	localparam int unsigned  CREDIT = LATENCY + 3;  // pipeline + sidestep + queue read
+	logic signed [$clog2(CREDIT):0]  Credit = -CREDIT;	// counts up from -CREDIT; sign bit set <=> credit left
+	uwire  give = yvld && yrdy;	// an output vector leaves the module, returning one credit
+	assign	take = (xvld || !xrdy) && Credit[$left(Credit)];	// data available (live or buffered) and credit left
+	always_ff @(posedge clk) begin
+		if(rst)  Credit <= -CREDIT;
+		else     Credit <= Credit + (give == take? 0 : give? -1 : 1);	// take spends (+1), give returns (-1)
+	end
+
+	//=== Per-PE Compute Pipeline =============================================
+	uwire fp_vec_t  r;
+	uwire [PE-1:0]  rvld_vec;
+	uwire  rvld;
+
+	for(genvar  pe = 0; pe < PE; pe++) begin : genPE
+		uwire [31:0]  xi = x_cur[pe];
+
+		//--- Segment selector (combinational) --------------------------------
+		uwire         sign = xi[31];
+		uwire [7:0]   exp_bits = xi[30:23];
+		uwire [K-1:0] sub  = xi[22:23-K];	// top K mantissa bits pick the sub-segment within an octave
+
+		// Octave index: exp 125->0, 126->1, 127->2, 128->3, 129->4
+		uwire [2:0]  octave = exp_bits - 8'd125;	// truncated to 3 bits: wraps for exponents outside 125..129
+
+		// Classify
+		uwire  is_near_zero = (exp_bits < 8'd125);	// biased exponent below 125, i.e. |x| < 0.25
+		uwire  is_pos_clamp = !sign && (exp_bits >= EXP_CLAMP);
+		uwire  is_neg_clamp =  sign && (exp_bits >= EXP_CLAMP);
+
+		// Segment index for ROM lookup: 0 = near zero, then the positive block, then the negative block
+		uwire [6:0]  seg_idx;	// NOTE(review): 7 bits only cover NUM_SEGS <= 128 (K <= 3); no check enforces this
+		if(1) begin : blkSegIdx
+			uwire [6:0]  pos_idx = 7'd1 + {1'b0, octave, sub};
+			uwire [6:0]  neg_idx = 7'(7'd1 + NUM_SUBS * NUM_OCTAVES) + {1'b0, octave, sub};
+			assign	seg_idx = is_near_zero? 7'd0 :	// NOTE(review): clamped lanes can index past NUM_SEGS-1;
+			                  sign? neg_idx : pos_idx;	// their result is discarded by the output mux below
+		end : blkSegIdx
+
+		//--- Coefficient lookup (combinational) ------------------------------
+		uwire [31:0]  coeff_a0 = COEFFS[seg_idx][0];
+		uwire [31:0]  coeff_a1 = COEFFS[seg_idx][1];
+		uwire [31:0]  coeff_a2 = COEFFS[seg_idx][2];
+
+		//--- Horner chain: 2 stages of pwpolyf_dspfp32 ----------------------
+		// Stage 0: s0 = a1 + a2 * x   (latency: 4 cycles)
+		// Stage 1: s1 = a0 + s0 * x   (latency: 4 cycles)
+		// Total: 8 cycles
+
+		// Valid pipeline: shifts in `take`, so Vld[i] is `take` delayed by i+1 cycles
+		logic [LATENCY-1:0]  Vld = '0;
+		always_ff @(posedge clk) begin
+			if(rst)  Vld <= '0;
+			else     Vld <= { Vld[$left(Vld)-1:0], take };
+		end
+		assign	rvld_vec[pe] = Vld[$left(Vld)];
+
+		// Delay x by 4 cycles for stage 1 input (free-running, no reset: validity is tracked by Vld)
+		logic [31:0]  Xd1 = 'x;
+		logic [31:0]  Xd2 = 'x;
+		logic [31:0]  Xd3 = 'x;
+		logic [31:0]  Xd4 = 'x;
+		always_ff @(posedge clk) begin
+			Xd1 <= xi;
+			Xd2 <= Xd1;
+			Xd3 <= Xd2;
+			Xd4 <= Xd3;
+		end
+
+		// Delay x by 8 cycles for pass-through on positive clamp (continues the chain above)
+		logic [31:0]  Xd5 = 'x;
+		logic [31:0]  Xd6 = 'x;
+		logic [31:0]  Xd7 = 'x;
+		logic [31:0]  Xd8 = 'x;
+		always_ff @(posedge clk) begin
+			Xd5 <= Xd4;
+			Xd6 <= Xd5;
+			Xd7 <= Xd6;
+			Xd8 <= Xd7;
+		end
+
+		// Delay a0 by 4 cycles for stage 1 C input
+		logic [31:0]  C0d1 = 'x;
+		logic [31:0]  C0d2 = 'x;
+		logic [31:0]  C0d3 = 'x;
+		logic [31:0]  C0d4 = 'x;
+		always_ff @(posedge clk) begin
+			C0d1 <= coeff_a0;
+			C0d2 <= C0d1;
+			C0d3 <= C0d2;
+			C0d4 <= C0d3;
+		end
+
+		// Stage 0: s0 = coeff_a1 + coeff_a2 * xi
+		uwire [31:0]  s0;
+		pwpolyf_dspfp32 dsp0 (
+			.clk, .rst,
+			.a(coeff_a2), .b(xi), .c(coeff_a1),
+			.r(s0), .rvld(Vld[3])
+		);
+
+		// Stage 1: s1 = a0_delayed + s0 * x_delayed
+		uwire [31:0]  s1;
+		pwpolyf_dspfp32 dsp1 (
+			.clk, .rst,
+			.a(s0), .b(Xd4), .c(C0d4),
+			.r(s1), .rvld(Vld[7])
+		);
+
+		//--- Clamp mux -------------------------------------------------------
+		logic [LATENCY-1:0]  NegClamp = '0;	// is_neg_clamp delayed alongside the data
+		logic [LATENCY-1:0]  PosClamp = '0;	// is_pos_clamp delayed alongside the data
+		always_ff @(posedge clk) begin
+			if(rst) begin
+				NegClamp <= '0;
+				PosClamp <= '0;
+			end
+			else begin
+				NegClamp <= { NegClamp[$left(NegClamp)-1:0], is_neg_clamp };
+				PosClamp <= { PosClamp[$left(PosClamp)-1:0], is_pos_clamp };
+			end
+		end
+
+		// Output mux: clamp flags take priority over the polynomial result s1
+		assign	r[pe] = NegClamp[$left(NegClamp)]? NEG_CLAMP_VAL :
+		                 PosClamp[$left(PosClamp)]? (POS_PASSTHROUGH? Xd8 : POS_CLAMP_VAL) :
+		                 s1;
+
+	end : genPE
+
+	// All PE results should be valid simultaneously; lane 0 is used as the shared valid
+	assign	rvld = rvld_vec[0];
+	always_ff @(posedge clk) begin
+		assert(rvld_vec == {(PE){rvld}}) else begin
+			$error("%m: Inconsistent output valid indications.");
+			$stop;
+		end
+	end
+
+	//=== Credit-backing Elastic Output Queue =================================
+	uwire  rrdy;	// queue input ready; only observed by the overrun check below
+	queue #(.DATA_WIDTH($bits(fp_vec_t)), .ELASTICITY(CREDIT)) obuf (	// queue.sv not visible here; presumably CREDIT slots -- confirm
+		.clk, .rst,
+		.idat(r), .ivld(rvld), .irdy(rrdy),
+		.odat(ydat), .ovld(yvld), .ordy(yrdy)
+	);
+	always_ff @(posedge clk) begin
+		assert(rrdy || !rvld) else begin	// a valid result must never meet a queue that is not ready
+			$error("%m: Result queue overrun.");
+			$stop;
+		end
+	end
+
+endmodule : pwpolyf
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh b/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh
new file mode 100644
index 0000000000..4783a69a8c
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh
@@ -0,0 +1,344 @@
+// Auto-generated by pwpolyf_sim.py — do not edit manually.
+// K=3, NUM_SEGS=81, NUM_OCTAVES=5, DEGREE=2
+
+localparam int unsigned PWPOLYF_K          = 3;
+localparam int unsigned PWPOLYF_NUM_SEGS   = 81;
+localparam int unsigned PWPOLYF_NUM_OCTAVES = 5;
+localparam int unsigned PWPOLYF_DEGREE     = 2;
+
+localparam logic [31:0] PWPOLYF_GELU_COEFFS[81][3] = '{
+    '{ 32'h37B98E70, 32'h3F000000, 32'h3ECA71FE },  // seg 0
+    '{ 32'hBA7ADBC7, 32'h3F0278C5, 32'h3EBE3708 },  // seg 1
+    '{ 32'hBAC1FAC3, 32'h3F036C36, 32'h3EBAD62D },  // seg 2
+    '{ 32'hBB0F7119, 32'h3F049537, 32'h3EB7205C },  // seg 3
+    '{ 32'hBB4C3199, 32'h3F05F665, 32'h3EB31D7A },  // seg 4
+    '{ 32'hBB8CE270, 32'h3F0793DC, 32'h3EAECF5D },  // seg 5
+    '{ 32'hBBBD42FF, 32'h3F096FF1, 32'h3EAA3BDC },  // seg 6
+    '{ 32'hBBF86AD7, 32'h3F0B8C93, 32'h3EA5686A },  // seg 7
+    '{ 32'hBC1FE938, 32'h3F0DEDAD, 32'h3EA05544 },  // seg 8
+    '{ 32'hBC61967B, 32'h3F11FAE4, 32'h3E985544 },  // seg 9
+    '{ 32'hBCA9F9E2, 32'h3F1853F6, 32'h3E8D0DA0 },  // seg 10
+    '{ 32'hBCF4024D, 32'h3F1FBA03, 32'h3E81380E },  // seg 11
+    '{ 32'hBD283275, 32'h3F281F23, 32'h3E6A0515 },  // seg 12
+    '{ 32'hBD6012DC, 32'h3F316E8A, 32'h3E5131A8 },  // seg 13
+    '{ 32'hBD90E65A, 32'h3F3B8AB1, 32'h3E384EA9 },  // seg 14
+    '{ 32'hBDB69B12, 32'h3F46505E, 32'h3E1FAEE2 },  // seg 15
+    '{ 32'hBDE0E236, 32'h3F519680, 32'h3E07A0FB },  // seg 16
+    '{ 32'hBE13A447, 32'h3F62F98E, 32'h3DCA8920 },  // seg 17
+    '{ 32'hBE483C53, 32'h3F7A5DD5, 32'h3D6E8B3A },  // seg 18
+    '{ 32'hBE7FADD1, 32'h3F8848AA, 32'h3CC08378 },  // seg 19
+    '{ 32'hBE9AF2F6, 32'h3F9227E3, 32'hBB967C2C },  // seg 20
+    '{ 32'hBEB35A6C, 32'h3F9A4E1A, 32'hBCD3D2D3 },  // seg 21
+    '{ 32'hBEC733E2, 32'h3FA06D38, 32'hBD2655D9 },  // seg 22
+    '{ 32'hBED5117D, 32'h3FA46699, 32'hBD4ACA8A },  // seg 23
+    '{ 32'hBEDC221E, 32'h3FA64B98, 32'hBD5B0CC8 },  // seg 24
+    '{ 32'hBED977F6, 32'h3FA5A98F, 32'hBD563F50 },  // seg 25
+    '{ 32'hBEC1A7EF, 32'h3FA066C3, 32'hBD310A4D },  // seg 26
+    '{ 32'hBE9AF247, 32'h3F98AA16, 32'hBCFF13C8 },  // seg 27
+    '{ 32'hBE609014, 32'h3F90E5FD, 32'hBCA4952F },  // seg 28
+    '{ 32'hBE1465DA, 32'h3F8A89C8, 32'hBC412BC3 },  // seg 29
+    '{ 32'hBDB39147, 32'h3F860470, 32'hBBCFC6D8 },  // seg 30
+    '{ 32'hBD47BD32, 32'h3F832984, 32'hBB4E0A09 },  // seg 31
+    '{ 32'hBCCC65EE, 32'h3F8187F7, 32'hBABC9CBA },  // seg 32
+    '{ 32'hBC07A969, 32'h3F807817, 32'hB9D51508 },  // seg 33
+    '{ 32'hBABA43A4, 32'h3F8012B3, 32'hB870A727 },  // seg 34
+    '{ 32'hB93762D6, 32'h3F800216, 32'hB6C22359 },  // seg 35
+    '{ 32'h3411E06E, 32'h3F800000, 32'h27AB3551 },  // seg 36
+    '{ 32'h341E8FEE, 32'h3F800000, 32'h28A3B0E4 },  // seg 37
+    '{ 32'h342B3EFE, 32'h3F800000, 32'hA7CDB1C0 },  // seg 38
+    '{ 32'h3437EE42, 32'h3F800000, 32'hA8538CE4 },  // seg 39
+    '{ 32'h34449DB9, 32'h3F800000, 32'hA71AF986 },  // seg 40
+    '{ 32'hBA7AD37E, 32'h3EFB0E96, 32'h3EBE3747 },  // seg 41
+    '{ 32'hBAC20DB3, 32'h3EF92715, 32'h3EBAD556 },  // seg 42
+    '{ 32'hBB0F6B5B, 32'h3EF6D5D9, 32'h3EB720C8 },  // seg 43
+    '{ 32'hBB4C290B, 32'h3EF41395, 32'h3EB31DFE },  // seg 44
+    '{ 32'hBB8CE04D, 32'h3EF0D873, 32'h3EAECF95 },  // seg 45
+    '{ 32'hBBBD43D9, 32'h3EED2010, 32'h3EAA3BCD },  // seg 46
+    '{ 32'hBBF87DB4, 32'h3EE8E58C, 32'h3EA566F9 },  // seg 47
+    '{ 32'hBC1FE3E0, 32'h3EE42555, 32'h3EA055F8 },  // seg 48
+    '{ 32'hBC6197C8, 32'h3EDC0A12, 32'h3E98551F },  // seg 49
+    '{ 32'hBCA9FA8F, 32'h3ECF57EF, 32'h3E8D0D83 },  // seg 50
+    '{ 32'hBCF40310, 32'h3EC08BD5, 32'h3E8137F1 },  // seg 51
+    '{ 32'hBD2834D6, 32'h3EAFC0E3, 32'h3E6A03EA },  // seg 52
+    '{ 32'hBD6013D1, 32'h3E9D229D, 32'h3E513144 },  // seg 53
+    '{ 32'hBD90E5AD, 32'h3E88EB08, 32'h3E384F28 },  // seg 54
+    '{ 32'hBDB6985F, 32'h3E66C185, 32'h3E1FB08A },  // seg 55
+    '{ 32'hBDE0DF77, 32'h3E39A8DB, 32'h3E07A275 },  // seg 56
+    '{ 32'hBE13A40F, 32'h3DE8345F, 32'h3DCA8983 },  // seg 57
+    '{ 32'hBE483C8C, 32'h3CB44289, 32'h3D6E8AAF },  // seg 58
+    '{ 32'hBE7FAE75, 32'hBD848C95, 32'h3CC08085 },  // seg 59
+    '{ 32'hBE9AF25E, 32'hBE113D70, 32'hBB9669BA },  // seg 60
+    '{ 32'hBEB35AF3, 32'hBE527226, 32'hBCD3D632 },  // seg 61
+    '{ 32'hBEC73409, 32'hBE81B50C, 32'hBD265643 },  // seg 62
+    '{ 32'hBED511FB, 32'hBE919AF1, 32'hBD4ACBC1 },  // seg 63
+    '{ 32'hBEDC2198, 32'hBE992DD4, 32'hBD5B0BA5 },  // seg 64
+    '{ 32'hBED9784C, 32'hBE96A68E, 32'hBD563FE9 },  // seg 65
+    '{ 32'hBEC1A80B, 32'hBE819B22, 32'hBD310A73 },  // seg 66
+    '{ 32'hBE9AF281, 32'hBE45510D, 32'hBCFF1457 },  // seg 67
+    '{ 32'hBE60906A, 32'hBE073026, 32'hBCA4958A },  // seg 68
+    '{ 32'hBE146346, 32'hBDA8992A, 32'hBC41276D },  // seg 69
+    '{ 32'hBDB38EDC, 32'hBD408B15, 32'hBBCFC36C },  // seg 70
+    '{ 32'hBD47AFC4, 32'hBCCA5237, 32'hBB4DF9C3 },  // seg 71
+    '{ 32'hBCCC9FAA, 32'hBC443685, 32'hBABCD98B },  // seg 72
+    '{ 32'hBC07BF94, 32'hBB7057D9, 32'hB9D53AA2 },  // seg 73
+    '{ 32'hBABACF46, 32'hBA160B9E, 32'hB8715A94 },  // seg 74
+    '{ 32'hB93E544F, 32'hB88BBAC8, 32'hB6CD58B9 },  // seg 75
+    '{ 32'hB80D0FA0, 32'hB7425B86, 32'hB585D73F },  // seg 76
+    '{ 32'h00000000, 32'h00000000, 32'h00000000 },  // seg 77
+    '{ 32'h00000000, 32'h00000000, 32'h00000000 },  // seg 78
+    '{ 32'h00000000, 32'h00000000, 32'h00000000 },  // seg 79
+    '{ 32'h00000000, 32'h00000000, 32'h00000000 }  // seg 80
+};
+
+localparam logic [31:0] PWPOLYF_SILU_COEFFS[81][3] = '{
+    '{ 32'h36E95DF5, 32'h3F000000, 32'h3E7EDC5E },  // seg 0
+    '{ 32'hB99F1DCE, 32'h3F00C86D, 32'h3E771EC2 },  // seg 1
+    '{ 32'hB9F6B213, 32'h3F01162E, 32'h3E74F652 },  // seg 2
+    '{ 32'hBA36FFF7, 32'h3F017588, 32'h3E72946C },  // seg 3
+    '{ 32'hBA82DBFB, 32'h3F01E7EE, 32'h3E6FFB3C },  // seg 4
+    '{ 32'hBAB54FCB, 32'h3F026E5A, 32'h3E6D2ECC },  // seg 5
+    '{ 32'hBAF49B79, 32'h3F030A0C, 32'h3E6A30AE },  // seg 6
+    '{ 32'hBB212F9B, 32'h3F03BBB5, 32'h3E6704CC },  // seg 7
+    '{ 32'hBB50782F, 32'h3F048563, 32'h3E63A86F },  // seg 8
+    '{ 32'hBB945CE5, 32'h3F05E1D4, 32'h3E5E4873 },  // seg 9
+    '{ 32'hBBE25E30, 32'h3F080BF2, 32'h3E569788 },  // seg 10
+    '{ 32'hBC24C259, 32'h3F0A9F94, 32'h3E4E59B3 },  // seg 11
+    '{ 32'hBC66B42F, 32'h3F0D9E75, 32'h3E45A38E },  // seg 12
+    '{ 32'hBC9C5244, 32'h3F11080E, 32'h3E3C8A8D },  // seg 13
+    '{ 32'hBCCE04C3, 32'h3F14DA5B, 32'h3E3322D2 },  // seg 14
+    '{ 32'hBD04800C, 32'h3F191096, 32'h3E298289 },  // seg 15
+    '{ 32'hBD26E43B, 32'h3F1DA640, 32'h3E1FBAD7 },  // seg 16
+    '{ 32'hBD637E9F, 32'h3F252125, 32'h3E10F462 },  // seg 17
+    '{ 32'hBDA33B9D, 32'h3F301FA3, 32'h3DFAD009 },  // seg 18
+    '{ 32'hBDDE6B32, 32'h3F3BF5F1, 32'h3DD4EBDE },  // seg 19
+    '{ 32'hBE1121CB, 32'h3F484C5B, 32'h3DB103B4 },  // seg 20
+    '{ 32'hBE369CC7, 32'h3F54CBA1, 32'h3D8FABD4 },  // seg 21
+    '{ 32'hBE5EB1E1, 32'h3F612225, 32'h3D6290D3 },  // seg 22
+    '{ 32'hBE842968, 32'h3F6D086D, 32'h3D2C2202 },  // seg 23
+    '{ 32'hBE99340A, 32'h3F7842D6, 32'h3CF862F4 },  // seg 24
+    '{ 32'hBEB7B0F6, 32'h3F83AB48, 32'h3C810FDC },  // seg 25
+    '{ 32'hBEDD21FD, 32'h3F8C031A, 32'h3AA0895B },  // seg 26
+    '{ 32'hBEFBE1D7, 32'h3F922E5A, 32'hBC0A6628 },  // seg 27
+    '{ 32'hBF0931A3, 32'h3F9649B5, 32'hBC6A59DD },  // seg 28
+    '{ 32'hBF101E8F, 32'h3F989B85, 32'hBC8E0AB0 },  // seg 29
+    '{ 32'hBF12EEED, 32'h3F997B24, 32'hBC96B85F },  // seg 30
+    '{ 32'hBF121CE8, 32'h3F994071, 32'hBC94AB86 },  // seg 31
+    '{ 32'hBF0E4CE0, 32'h3F983CFA, 32'hBC8C0C06 },  // seg 32
+    '{ 32'hBF047881, 32'h3F95D0C8, 32'hBC71DF34 },  // seg 33
+    '{ 32'hBEE597C0, 32'h3F91E386, 32'hBC3A0377 },  // seg 34
+    '{ 32'hBEBEB59D, 32'h3F8DFF08, 32'hBC081E5A },  // seg 35
+    '{ 32'hBE994535, 32'h3F8A965C, 32'hBBC0C1D9 },  // seg 36
+    '{ 32'hBE700CF7, 32'h3F87CFE4, 32'hBB856F55 },  // seg 37
+    '{ 32'hBE37FE86, 32'h3F85A6F4, 32'hBB35A189 },  // seg 38
+    '{ 32'hBE0A8F49, 32'h3F8406CD, 32'hBAF420DA },  // seg 39
+    '{ 32'hBDCD8C89, 32'h3F82D4E6, 32'hBAA265F3 },  // seg 40
+    '{ 32'hB99F15B4, 32'h3EFE6F36, 32'h3E771F07 },  // seg 41
+    '{ 32'hB9F6E275, 32'h3EFDD355, 32'h3E74F54A },  // seg 42
+    '{ 32'hBA370764, 32'h3EFD14DB, 32'h3E729434 },  // seg 43
+    '{ 32'hBA82CBD2, 32'h3EFC307E, 32'h3E6FFC3C },  // seg 44
+    '{ 32'hBAB519A0, 32'h3EFB2461, 32'h3E6D318E },  // seg 45
+    '{ 32'hBAF49002, 32'h3EF9EC20, 32'h3E6A3138 },  // seg 46
+    '{ 32'hBB213894, 32'h3EF88848, 32'h3E670422 },  // seg 47
+    '{ 32'hBB509289, 32'h3EF6F461, 32'h3E63A6AF },  // seg 48
+    '{ 32'hBB946026, 32'h3EF43C28, 32'h3E5E481D },  // seg 49
+    '{ 32'hBBE25D58, 32'h3EEFE827, 32'h3E56979C },  // seg 50
+    '{ 32'hBC24C311, 32'h3EEAC0C7, 32'h3E4E599C },  // seg 51
+    '{ 32'hBC66B0C8, 32'h3EE4C363, 32'h3E45A3FC },  // seg 52
+    '{ 32'hBC9C5250, 32'h3EDDEFE3, 32'h3E3C8A8A },  // seg 53
+    '{ 32'hBCCE0263, 32'h3ED64BA6, 32'h3E33233F },  // seg 54
+    '{ 32'hBD0483B9, 32'h3ECDDDD0, 32'h3E29816B },  // seg 55
+    '{ 32'hBD26E21A, 32'h3EC4B40E, 32'h3E1FBB6C },  // seg 56
+    '{ 32'hBD637E94, 32'h3EB5BDB7, 32'h3E10F463 },  // seg 57
+    '{ 32'hBDA33BD9, 32'h3E9FC0A2, 32'h3DFACFDF },  // seg 58
+    '{ 32'hBDDE6AB7, 32'h3E88144E, 32'h3DD4EC2A },  // seg 59
+    '{ 32'hBE11213C, 32'h3E5ECF5B, 32'h3DB10440 },  // seg 60
+    '{ 32'hBE369BA8, 32'h3E2CD2EE, 32'h3D8FACC1 },  // seg 61
+    '{ 32'hBE5EB20C, 32'h3DF6EE82, 32'h3D6290A4 },  // seg 62
+    '{ 32'hBE8428DF, 32'h3D97BEFB, 32'h3D2C2354 },  // seg 63
+    '{ 32'hBE9933B8, 32'h3CF7AA94, 32'h3CF8644F },  // seg 64
+    '{ 32'hBEB7B09B, 32'hBCEACCA1, 32'h3C811126 },  // seg 65
+    '{ 32'hBEDD2230, 32'hBDC0323A, 32'h3AA081A5 },  // seg 66
+    '{ 32'hBEFBE177, 32'hBE11723B, 32'hBC0A645F },  // seg 67
+    '{ 32'hBF093217, 32'hBE324EF2, 32'hBC6A5D77 },  // seg 68
+    '{ 32'hBF101F44, 32'hBE44DDF5, 32'hBC8E0CF8 },  // seg 69
+    '{ 32'hBF12EEFF, 32'hBE4BD952, 32'hBC96B899 },  // seg 70
+    '{ 32'hBF121E42, 32'hBE4A0685, 32'hBC94AED3 },  // seg 71
+    '{ 32'hBF0E4E2A, 32'hBE41EA78, 32'hBC8C0EC0 },  // seg 72
+    '{ 32'hBF047922, 32'hBE2E876E, 32'hBC71E175 },  // seg 73
+    '{ 32'hBEE5994E, 32'hBE0F1D79, 32'hBC3A059D },  // seg 74
+    '{ 32'hBEBEB455, 32'hBDDFEE7F, 32'hBC081CCC },  // seg 75
+    '{ 32'hBE9948EA, 32'hBDA96AD6, 32'hBBC0C8E9 },  // seg 76
+    '{ 32'hBE701045, 32'hBD7A00B1, 32'hBB8571EB },  // seg 77
+    '{ 32'hBE3805EB, 32'hBD34E735, 32'hBB35ABED },  // seg 78
+    '{ 32'hBE0A9538, 32'hBD00E03E, 32'hBAF42F6A },  // seg 79
+    '{ 32'hBDCD9AF0, 32'hBCB54853, 32'hBAA2755A }  // seg 80
+};
+
+localparam logic [31:0] PWPOLYF_SIGMOID_COEFFS[81][3] = '{
+    '{ 32'h3F000000, 32'h3E7F33B4, 32'hB21FFF88 },  // seg 0
+    '{ 32'h3EFFCF27, 32'h3E822CCD, 32'hBC84C1F2 },  // seg 1
+    '{ 32'h3EFFBC74, 32'h3E82B1D2, 32'hBC938C36 },  // seg 2
+    '{ 32'h3EFFA5B5, 32'h3E834361, 32'hBCA21BEF },  // seg 3
+    '{ 32'h3EFF8A9F, 32'h3E83E0E0, 32'hBCB06BD4 },  // seg 4
+    '{ 32'h3EFF6B53, 32'h3E8487A2, 32'hBCBE4EDE },  // seg 5
+    '{ 32'h3EFF47DE, 32'h3E853610, 32'hBCCBB848 },  // seg 6
+    '{ 32'h3EFF1FD8, 32'h3E85ED00, 32'hBCD8C92D },  // seg 7
+    '{ 32'h3EFEF3E7, 32'h3E86A89A, 32'hBCE54D90 },  // seg 8
+    '{ 32'h3EFEA7FC, 32'h3E87D4B7, 32'hBCF7D81E },  // seg 9
+    '{ 32'h3EFE3434, 32'h3E897082, 32'hBD075E74 },  // seg 10
+    '{ 32'h3EFDB0A7, 32'h3E8B15AC, 32'hBD11E821 },  // seg 11
+    '{ 32'h3EFD1FFF, 32'h3E8CBAB3, 32'hBD1B7B94 },  // seg 12
+    '{ 32'h3EFC85A0, 32'h3E8E568F, 32'hBD2411C8 },  // seg 13
+    '{ 32'h3EFBE681, 32'h3E8FDE72, 32'hBD2B9C7D },  // seg 14
+    '{ 32'h3EFB46C8, 32'h3E914BCF, 32'hBD322448 },  // seg 15
+    '{ 32'h3EFAAB5A, 32'h3E9297AA, 32'hBD37AD8F },  // seg 16
+    '{ 32'h3EF9DBB3, 32'h3E9432BF, 32'hBD3E0993 },  // seg 17
+    '{ 32'h3EF909EB, 32'h3E95A9C9, 32'hBD43470B },  // seg 18
+    '{ 32'h3EF8B573, 32'h3E9632E0, 32'hBD450418 },  // seg 19
+    '{ 32'h3EF909F7, 32'h3E95B9B4, 32'hBD43A89C },  // seg 20
+    '{ 32'h3EFA2B65, 32'h3E943959, 32'hBD3FAB94 },  // seg 21
+    '{ 32'h3EFC33ED, 32'h3E91B9FE, 32'hBD3988A9 },  // seg 22
+    '{ 32'h3EFF34ED, 32'h3E8E4C2C, 32'hBD31B440 },  // seg 23
+    '{ 32'h3F01997B, 32'h3E8A0ACB, 32'hBD28A198 },  // seg 24
+    '{ 32'h3F057A8B, 32'h3E8262C5, 32'hBD198434 },  // seg 25
+    '{ 32'h3F0C3064, 32'h3E6CECF8, 32'hBD0452A9 },  // seg 26
+    '{ 32'h3F1447E1, 32'h3E530782, 32'hBCDF318F },  // seg 27
+    '{ 32'h3F1D4A68, 32'h3E38CE80, 32'hBCB905FB },  // seg 28
+    '{ 32'h3F26C195, 32'h3E1F8C72, 32'hBC9750E8 },  // seg 29
+    '{ 32'h3F3044A9, 32'h3E081DC6, 32'hBC74E58A },  // seg 30
+    '{ 32'h3F3982E1, 32'h3DE5F174, 32'hBC448461 },  // seg 31
+    '{ 32'h3F424019, 32'h3DC09FBD, 32'hBC1CAB6F },  // seg 32
+    '{ 32'h3F4DF478, 32'h3D924A5B, 32'hBBDD9BF8 },  // seg 33
+    '{ 32'h3F5B2AC7, 32'h3D464E25, 32'hBB897CAF },  // seg 34
+    '{ 32'h3F656FAC, 32'h3D045E36, 32'hBB291861 },  // seg 35
+    '{ 32'h3F6D25C5, 32'h3CAEBA7A, 32'hBACED519 },  // seg 36
+    '{ 32'h3F72CA96, 32'h3C64B5A7, 32'hBA7C2B6D },  // seg 37
+    '{ 32'h3F76D7D9, 32'h3C14B4A4, 32'hBA196B60 },  // seg 38
+    '{ 32'h3F79B538, 32'h3BC060EC, 32'hB9BA7A5A },  // seg 39
+    '{ 32'h3F7BB5E3, 32'h3B77B8AD, 32'hB9626A80 },  // seg 40
+    '{ 32'h3F001880, 32'h3E822DEE, 32'h3C84E3F8 },  // seg 41
+    '{ 32'h3F0021CE, 32'h3E82B23F, 32'h3C939805 },  // seg 42
+    '{ 32'h3F002D26, 32'h3E834368, 32'h3CA21C3C },  // seg 43
+    '{ 32'h3F003A99, 32'h3E83DFE3, 32'h3CB05650 },  // seg 44
+    '{ 32'h3F004A09, 32'h3E84848C, 32'h3CBE0FC2 },  // seg 45
+    '{ 32'h3F005BFD, 32'h3E853553, 32'h3CCBAA1A },  // seg 46
+    '{ 32'h3F007027, 32'h3E85EDAA, 32'h3CD8D518 },  // seg 47
+    '{ 32'h3F008689, 32'h3E86AC9A, 32'h3CE58F3C },  // seg 48
+    '{ 32'h3F00AC00, 32'h3E87D4A7, 32'h3CF7D732 },  // seg 49
+    '{ 32'h3F00E601, 32'h3E89713C, 32'h3D076365 },  // seg 50
+    '{ 32'h3F0127C8, 32'h3E8B1652, 32'h3D11EC06 },  // seg 51
+    '{ 32'h3F016FFD, 32'h3E8CBAA0, 32'h3D1B7B31 },  // seg 52
+    '{ 32'h3F01BD0C, 32'h3E8E55D8, 32'h3D240E23 },  // seg 53
+    '{ 32'h3F020CD3, 32'h3E8FDED1, 32'h3D2B9E4A },  // seg 54
+    '{ 32'h3F025CCD, 32'h3E914CAB, 32'h3D322819 },  // seg 55
+    '{ 32'h3F02AA33, 32'h3E929729, 32'h3D37AB80 },  // seg 56
+    '{ 32'h3F031236, 32'h3E9432FB, 32'h3D3E0A78 },  // seg 57
+    '{ 32'h3F037AF9, 32'h3E95A98C, 32'h3D43463B },  // seg 58
+    '{ 32'h3F03A527, 32'h3E96327F, 32'h3D4502ED },  // seg 59
+    '{ 32'h3F037B01, 32'h3E95B9AA, 32'h3D43A880 },  // seg 60
+    '{ 32'h3F02EA67, 32'h3E94399C, 32'h3D3FAC3D },  // seg 61
+    '{ 32'h3F01E5FE, 32'h3E91B9E3, 32'h3D39886A },  // seg 62
+    '{ 32'h3F00659B, 32'h3E8E4C54, 32'h3D31B499 },  // seg 63
+    '{ 32'h3EFCCCD9, 32'h3E8A0A9A, 32'h3D28A135 },  // seg 64
+    '{ 32'h3EF50AC9, 32'h3E8262A6, 32'h3D1983F9 },  // seg 65
+    '{ 32'h3EE79F36, 32'h3E6CECF3, 32'h3D0452A4 },  // seg 66
+    '{ 32'h3ED77017, 32'h3E530747, 32'h3CDF3133 },  // seg 67
+    '{ 32'h3EC56B50, 32'h3E38CEAF, 32'h3CB9063C },  // seg 68
+    '{ 32'h3EB27DCB, 32'h3E1F8DAD, 32'h3C97527C },  // seg 69
+    '{ 32'h3E9F76E9, 32'h3E081E0B, 32'h3C74E62F },  // seg 70
+    '{ 32'h3E8CFAEA, 32'h3DE5F2F3, 32'h3C448608 },  // seg 71
+    '{ 32'h3E770090, 32'h3DC0A0B8, 32'h3C1CAC71 },  // seg 72
+    '{ 32'h3E482E7E, 32'h3D924AB4, 32'h3BDD9CA1 },  // seg 73
+    '{ 32'h3E13552B, 32'h3D464E9B, 32'h3B897D12 },  // seg 74
+    '{ 32'h3DD481AF, 32'h3D045D7B, 32'h3B29173F },  // seg 75
+    '{ 32'h3D96D28F, 32'h3CAEBB74, 32'h3ACED672 },  // seg 76
+    '{ 32'h3D5358A4, 32'h3C64B82E, 32'h3A7C2E99 },  // seg 77
+    '{ 32'h3D1283F5, 32'h3C14B667, 32'h3A196D6B },  // seg 78
+    '{ 32'h3CC95645, 32'h3BC05DE7, 32'h39BA76F4 },  // seg 79
+    '{ 32'h3C8947BE, 32'h3B77C111, 32'h39627305 }  // seg 80
+};
+
+localparam logic [31:0] PWPOLYF_TANH_COEFFS[81][3] = '{
+    '{ 32'h24C775B8, 32'h3F7CD991, 32'hA73006D1 },  // seg 0
+    '{ 32'hBBAC00F6, 32'h3F87D4AF, 32'hBE77D79E },  // seg 1
+    '{ 32'hBBE5F686, 32'h3F8970F2, 32'hBE87616C },  // seg 2
+    '{ 32'hBC13E04D, 32'h3F8B1626, 32'hBE91EAFF },  // seg 3
+    '{ 32'hBC38062F, 32'h3F8CBAF6, 32'hBE9B7D12 },  // seg 4
+    '{ 32'hBC5E87C9, 32'h3F8E55E8, 32'hBEA40E6C },  // seg 5
+    '{ 32'hBC833183, 32'h3F8FDE94, 32'hBEAB9D26 },  // seg 6
+    '{ 32'hBC97304B, 32'h3F914C74, 32'hBEB22725 },  // seg 7
+    '{ 32'hBCAA905C, 32'h3F929762, 32'hBEB7AC6C },  // seg 8
+    '{ 32'hBCC48DB5, 32'h3F9432FE, 32'hBEBE0A82 },  // seg 9
+    '{ 32'hBCDEBF5B, 32'h3F95A99C, 32'hBEC34673 },  // seg 10
+    '{ 32'hBCE94C8C, 32'h3F9632A2, 32'hBEC50358 },  // seg 11
+    '{ 32'hBCDEBE2C, 32'h3F95B992, 32'hBEC3A83E },  // seg 12
+    '{ 32'hBCBA9ED9, 32'h3F9439CF, 32'hBEBFACBF },  // seg 13
+    '{ 32'hBC72F4C3, 32'h3F91B9B3, 32'hBEB987F9 },  // seg 14
+    '{ 32'hBB4B4F04, 32'h3F8E4C70, 32'hBEB1B4D9 },  // seg 15
+    '{ 32'h3C4CBE96, 32'h3F8A0AC9, 32'hBEA8A196 },  // seg 16
+    '{ 32'h3D2F5351, 32'h3F8262A8, 32'hBE9983FC },  // seg 17
+    '{ 32'h3DC30600, 32'h3F6CED04, 32'hBE8452B2 },  // seg 18
+    '{ 32'h3E223FEB, 32'h3F53072C, 32'hBE5F310C },  // seg 19
+    '{ 32'h3E6A52FA, 32'h3F38CE9A, 32'hBE39061F },  // seg 20
+    '{ 32'h3E9B0431, 32'h3F1F8DD2, 32'hBE1752AC },  // seg 21
+    '{ 32'h3EC1122B, 32'h3F081E0D, 32'hBDF4E633 },  // seg 22
+    '{ 32'h3EE60A55, 32'h3EE5F2C5, 32'hBDC485D6 },  // seg 23
+    '{ 32'h3F047F8C, 32'h3EC0A114, 32'hBD9CACD1 },  // seg 24
+    '{ 32'h3F1BE8C6, 32'h3E924AAB, 32'hBD5D9C90 },  // seg 25
+    '{ 32'h3F365584, 32'h3E464E44, 32'hBD097CC9 },  // seg 26
+    '{ 32'h3F4ADF8C, 32'h3E045D95, 32'hBCA91767 },  // seg 27
+    '{ 32'h3F5A4B6D, 32'h3DAEBB15, 32'hBC4ED5EF },  // seg 28
+    '{ 32'h3F6594EE, 32'h3D64B815, 32'hBBFC2E78 },  // seg 29
+    '{ 32'h3F6DAF91, 32'h3D14B5D0, 32'hBB996CB5 },  // seg 30
+    '{ 32'h3F736A9F, 32'h3CC05DAA, 32'hBB3A76B3 },  // seg 31
+    '{ 32'h3F776B54, 32'h3C77C747, 32'hBAE2796C },  // seg 32
+    '{ 32'h3F7B290A, 32'h3C00F553, 32'hBA590596 },  // seg 33
+    '{ 32'h3F7DD423, 32'h3B51CDAD, 32'hB99FBCDD },  // seg 34
+    '{ 32'h3F7F0B0E, 32'h3AA91301, 32'hB8EB151C },  // seg 35
+    '{ 32'h3F7F95A3, 32'h3A073EF5, 32'hB82D041E },  // seg 36
+    '{ 32'h3F7FD274, 32'h3956AD85, 32'hB77E46D5 },  // seg 37
+    '{ 32'h3F7FECAF, 32'h38A9A5DF, 32'hB6BB164C },  // seg 38
+    '{ 32'h3F7FF7E0, 32'h3805A163, 32'hB609E0DD },  // seg 39
+    '{ 32'h3F7FFCA1, 32'h37505C2F, 32'hB549DC3A },  // seg 40
+    '{ 32'h3BAC00F6, 32'h3F87D4AF, 32'h3E77D79E },  // seg 41
+    '{ 32'h3BE5F686, 32'h3F8970F2, 32'h3E87616C },  // seg 42
+    '{ 32'h3C13E04D, 32'h3F8B1626, 32'h3E91EAFF },  // seg 43
+    '{ 32'h3C38062F, 32'h3F8CBAF6, 32'h3E9B7D12 },  // seg 44
+    '{ 32'h3C5E87C9, 32'h3F8E55E8, 32'h3EA40E6C },  // seg 45
+    '{ 32'h3C833183, 32'h3F8FDE94, 32'h3EAB9D26 },  // seg 46
+    '{ 32'h3C97304B, 32'h3F914C74, 32'h3EB22725 },  // seg 47
+    '{ 32'h3CAA905C, 32'h3F929762, 32'h3EB7AC6C },  // seg 48
+    '{ 32'h3CC48DB5, 32'h3F9432FE, 32'h3EBE0A82 },  // seg 49
+    '{ 32'h3CDEBF5B, 32'h3F95A99C, 32'h3EC34673 },  // seg 50
+    '{ 32'h3CE94C8C, 32'h3F9632A2, 32'h3EC50358 },  // seg 51
+    '{ 32'h3CDEBE2C, 32'h3F95B992, 32'h3EC3A83E },  // seg 52
+    '{ 32'h3CBA9ED9, 32'h3F9439CF, 32'h3EBFACBF },  // seg 53
+    '{ 32'h3C72F4C3, 32'h3F91B9B3, 32'h3EB987F9 },  // seg 54
+    '{ 32'h3B4B4F04, 32'h3F8E4C70, 32'h3EB1B4D9 },  // seg 55
+    '{ 32'hBC4CBE96, 32'h3F8A0AC9, 32'h3EA8A196 },  // seg 56
+    '{ 32'hBD2F5351, 32'h3F8262A8, 32'h3E9983FC },  // seg 57
+    '{ 32'hBDC30600, 32'h3F6CED04, 32'h3E8452B2 },  // seg 58
+    '{ 32'hBE223FEB, 32'h3F53072C, 32'h3E5F310C },  // seg 59
+    '{ 32'hBE6A52FA, 32'h3F38CE9A, 32'h3E39061F },  // seg 60
+    '{ 32'hBE9B0431, 32'h3F1F8DD2, 32'h3E1752AC },  // seg 61
+    '{ 32'hBEC1122B, 32'h3F081E0D, 32'h3DF4E633 },  // seg 62
+    '{ 32'hBEE60A55, 32'h3EE5F2C5, 32'h3DC485D6 },  // seg 63
+    '{ 32'hBF047F8C, 32'h3EC0A114, 32'h3D9CACD1 },  // seg 64
+    '{ 32'hBF1BE8C6, 32'h3E924AAB, 32'h3D5D9C90 },  // seg 65
+    '{ 32'hBF365584, 32'h3E464E44, 32'h3D097CC9 },  // seg 66
+    '{ 32'hBF4ADF8C, 32'h3E045D95, 32'h3CA91767 },  // seg 67
+    '{ 32'hBF5A4B6D, 32'h3DAEBB15, 32'h3C4ED5EF },  // seg 68
+    '{ 32'hBF6594EE, 32'h3D64B815, 32'h3BFC2E78 },  // seg 69
+    '{ 32'hBF6DAF91, 32'h3D14B5D0, 32'h3B996CB5 },  // seg 70
+    '{ 32'hBF736A9F, 32'h3CC05DAA, 32'h3B3A76B3 },  // seg 71
+    '{ 32'hBF776B54, 32'h3C77C747, 32'h3AE2796C },  // seg 72
+    '{ 32'hBF7B290A, 32'h3C00F553, 32'h3A590596 },  // seg 73
+    '{ 32'hBF7DD423, 32'h3B51CDAD, 32'h399FBCDD },  // seg 74
+    '{ 32'hBF7F0B0E, 32'h3AA91301, 32'h38EB151C },  // seg 75
+    '{ 32'hBF7F95A3, 32'h3A073EF5, 32'h382D041E },  // seg 76
+    '{ 32'hBF7FD274, 32'h3956AD85, 32'h377E46D5 },  // seg 77
+    '{ 32'hBF7FECAF, 32'h38A9A5DF, 32'h36BB164C },  // seg 78
+    '{ 32'hBF7FF7E0, 32'h3805A163, 32'h3609E0DD },  // seg 79
+    '{ 32'hBF7FFCA1, 32'h37505C2F, 32'h3549DC3A }  // seg 80
+};
+
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
new file mode 100644
index 0000000000..eecf2ac74d
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
@@ -0,0 +1,69 @@
+/******************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *  3. Neither the name of the copyright holder nor the names of its
+ *     contributors may be used to endorse or promote products derived from
+ *     this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief	Verilog wrapper for pwpolyf IP packaging.
+ */
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+	parameter  PE = $PE$,	// number of 32-bit elements carried per stream word
+	parameter  FUNC = $FUNC$	// activation selector, handed to the core unchanged
+)(
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+	input	ap_clk,
+	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+	input	ap_rst_n,
+
+	//- AXI Stream - Input --------------
+	output	in0_V_TREADY,
+	input	in0_V_TVALID,
+	input [$IN_WIDTH$-1:0]	in0_V_TDATA,
+
+	//- AXI Stream - Output -------------
+	input	out_V_TREADY,
+	output	out_V_TVALID,
+	output [$OUT_WIDTH$-1:0]	out_V_TDATA
+);
+
+	pwpolyf #(
+		.PE(PE),
+		.FUNC(FUNC)
+	) core (
+		.clk(ap_clk),
+		.rst(!ap_rst_n),	// core receives the inverted form of the active-low reset
+		.xdat(in0_V_TDATA),
+		.xvld(in0_V_TVALID),
+		.xrdy(in0_V_TREADY),
+		.ydat(out_V_TDATA),
+		.yvld(out_V_TVALID),
+		.yrdy(out_V_TREADY)
+	);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/pwpolyf/hdl/queue.sv b/finn-rtllib/pwpolyf/hdl/queue.sv
new file mode 100755
index 0000000000..e5c3cf9889
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/queue.sv
@@ -0,0 +1,78 @@
+/****************************************************************************
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @author	Thomas B. Preußer <thomas.preusser@amd.com>
+ ***************************************************************************/
+
+module queue #(
+	int unsigned  DATA_WIDTH,	// width of one queued word in bits
+	int unsigned  ELASTICITY	// number of entries in the storage array A; must be >= 2
+)(
+	input	logic  clk,
+	input	logic  rst,	// synchronous: only evaluated inside the clocked block
+
+	input	logic [DATA_WIDTH-1:0]  idat,
+	input	logic  ivld,
+	output	logic  irdy,	// driven by the registered Rdy flag
+
+	output	logic [DATA_WIDTH-1:0]  odat,
+	output	logic  ovld,
+	input	logic  ordy
+);
+
+	typedef logic [DATA_WIDTH-1:0]  dat_t;
+	initial begin	// parameter sanity check at simulation start
+		if(ELASTICITY < 2) begin
+			$error("%m: ELASTICITY of %0d must be made 2 or above.", ELASTICITY);
+			$finish;
+		end
+	end
+
+	logic signed [$clog2(ELASTICITY):0]  Ptr = '1;	// -1, 0, 1, ..., ELASTICITY-1; index of the oldest word in A, -1 (sign bit set) when A is empty
+	logic  Rdy = 1;	// registered "space available in A" flag, cross-checked by the assertion below
+	dat_t  A[ELASTICITY];	// shift-in storage: the newest word always enters at A[0]
+	assign	irdy = Rdy;
+
+	logic  Vld = 0;	// output register B currently holds a valid word
+	dat_t  B = 'x;	// output register feeding odat
+	assign	odat = B;
+	assign	ovld = Vld;
+
+	uwire  bload = !Vld || ordy;	// B is empty or is being consumed this cycle
+	uwire  push = Rdy && ivld;	// input handshake completes this cycle
+	uwire  pop = !Ptr[$left(Ptr)] && bload;	// A is non-empty and B can be (re)loaded from it
+
+	always_ff @(posedge clk) begin	// storage has no reset; validity is tracked by Ptr alone
+		if(push)  A <= { idat, A[0:ELASTICITY-2] };
+	end
+
+	always_ff @(posedge clk) begin
+		if(rst) begin
+			Ptr <= '1;
+			Rdy <= 1;
+			Vld <= 0;
+			B <= 'x;
+		end
+		else begin
+			// Make sure Rdy encodes what it's supposed to: space available in queue
+			assert(Rdy == (Ptr < signed'(ELASTICITY-1))) else begin
+				$error("%m: Broken Rdy computation.");
+				$stop;
+			end
+
+			Ptr <= Ptr + ((push == pop)? 0 : push? 1 : -1);	// +1 on push only, -1 on pop only
+			//  pop ==  push: no change
+			//  pop && !push: new space
+			// !pop &&  push: remaining space if not yet Ptr == ELASTICITY-2
+			Rdy <= (pop == push)? Rdy : pop? 1 : Ptr[$left(Ptr)] || (((ELASTICITY-2) & ~Ptr[$left(Ptr)-1:0]) != 0);
+			if(bload) begin
+				Vld <= !Ptr[$left(Ptr)];	// valid only if A had a word to hand over
+				B <= A[Ptr[$left(Ptr)-1:0]];	// oldest word; Vld is loaded with 0 whenever A is empty
+			end
+		end
+	end
+
+endmodule : queue
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index ecc1d28c53..dad7910e5c 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -350,6 +350,8 @@ def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig):
     model = model.transform(to_hw.InferLabelSelectLayer())
     # input quantization (if any) as standalone threshold
     model = model.transform(to_hw.InferThresholdingLayer())
+    # piecewise polynomial activations (GELU, SiLU, Sigmoid, Tanh)
+    model = model.transform(to_hw.InferPWPolyFLayer())
     # needed for convolutions -- TODO always exec?
     need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0
     if need_conv:
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index aed2ab7fe1..c924b538a0 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -42,6 +42,7 @@
 from finn.custom_op.fpgadataflow.lookup import Lookup
 from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
 from finn.custom_op.fpgadataflow.pool import Pool
+from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
 from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
     StreamingDataflowPartition,
 )
@@ -76,6 +77,7 @@
 custom_op["LabelSelect"] = LabelSelect
 custom_op["Lookup"] = Lookup
 custom_op["Pool"] = Pool
+custom_op["PWPolyF"] = PWPolyF
 custom_op["StreamingConcat"] = StreamingConcat
 custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter
 custom_op["StreamingEltwise"] = StreamingEltwise
diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
new file mode 100644
index 0000000000..e05ba9c2aa
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py
@@ -0,0 +1,187 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+# Piecewise polynomial constants matching the RTL module
+_NUM_OCTAVES = 5
+_SUPPORTED_FUNCS = {"gelu", "silu", "sigmoid", "tanh"}
+
+
+class PWPolyF(HWCustomOp):
+    """
+    HW op for piecewise polynomial activations (GELU, SiLU, Sigmoid, Tanh).
+
+    Element-wise FP32, coefficients baked into RTL.  No weights or BRAM.
+    """
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        """Return this op's attribute schema merged with the inherited HWCustomOp one."""
+        my_attrs = {
+            # activation function: gelu, silu, sigmoid, tanh
+            "func": ("s", True, ""),
+            # top-mantissa subdivision bits (K=3 gives 81 segments)
+            "K": ("i", False, 3),
+            # parallelism; elements processed per cycle
+            "PE": ("i", True, 0),
+            # number of channels (last dimension of input tensor)
+            "NumChannels": ("i", True, 0),
+            # FINN DataTypes for inputs, outputs (always FLOAT32)
+            "inputDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+            # number of input vectors: [1] is a single vector (FC layer, batch=1),
+            # [4] is four vectors (FC, batch=4), [1, 4, 4] is 4 * 4 vectors (conv, batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_num_segments(self):
+        """Return the number of polynomial segments, 1 + 2 * _NUM_OCTAVES * 2**K."""
+        return 1 + 2 * _NUM_OCTAVES * (1 << self.get_nodeattr("K"))
+
+    def make_shape_compatible_op(self, model):
+        """Return a constant-shape op with this node's normal output shape."""
+        return super().make_const_shape_op(self.get_normal_output_shape())
+
+    def infer_node_datatype(self, model):
+        """Adopt the model's input tensor datatype and annotate the output tensor."""
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], self.get_output_datatype())
+
+    def verify_node(self):
+        """Return info messages on the backend, func, folding and input datatype attributes."""
+        info_messages = []
+
+        if self.get_nodeattr("backend") == "fpgadataflow":
+            info_messages.append("Attribute backend is set correctly")
+        else:
+            info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+        func = self.get_nodeattr("func")
+        if func in _SUPPORTED_FUNCS:
+            info_messages.append("Attribute func is set correctly")
+        else:
+            # sorted list: the message must not depend on set iteration order
+            supported = sorted(_SUPPORTED_FUNCS)
+            info_messages.append("Attribute func must be one of %s, got %s" % (supported, func))
+
+        pe = self.get_nodeattr("PE")
+        nch = self.get_nodeattr("NumChannels")
+        if pe > 0 and nch > 0 and nch % pe == 0:
+            info_messages.append("PE divides NumChannels")
+        else:
+            info_messages.append("PE must divide NumChannels evenly")
+
+        idt = self.get_nodeattr("inputDataType")
+        if idt != "FLOAT32":
+            info_messages.append("PWPolyF requires FLOAT32 input, got %s" % idt)
+
+        return info_messages
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        """Returns input stream width in bits (PE elements of the input datatype)."""
+        return self.get_input_datatype().bitwidth() * self.get_nodeattr("PE")
+
+    def get_outstream_width(self, ind=0):
+        """Returns output stream width in bits (PE elements of the output datatype)."""
+        return self.get_output_datatype().bitwidth() * self.get_nodeattr("PE")
+
+    def get_folded_input_shape(self, ind=0):
+        """Returns numInputVectors + [NumChannels // PE, PE] as a tuple."""
+        pe = self.get_nodeattr("PE")
+        fold = self.get_nodeattr("NumChannels") // pe
+        return tuple(list(self.get_nodeattr("numInputVectors")) + [fold, pe])
+
+    def get_folded_output_shape(self, ind=0):
+        """Returns the folded input shape; output and input shapes are identical."""
+        return self.get_folded_input_shape()
+
+    def get_normal_input_shape(self, ind=0):
+        """Returns numInputVectors + [NumChannels] as a tuple."""
+        nch = self.get_nodeattr("NumChannels")
+        return tuple(list(self.get_nodeattr("numInputVectors")) + [nch])
+
+    def get_normal_output_shape(self, ind=0):
+        """Returns the normal input shape; output and input shapes are identical."""
+        return self.get_normal_input_shape()
+
+    def get_number_output_values(self):
+        """Returns the product of all folded output dims except the innermost (PE) one."""
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_exp_cycles(self):
+        """Returns the folded word count: II=1, latency amortised over stream length."""
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def lut_estimation(self):
+        """Returns a flat estimate of 200 LUTs per PE."""
+        return 200 * self.get_nodeattr("PE")
+
+    def bram_estimation(self):
+        """Returns 0: coefficients are stored in LUT ROM, not BRAM."""
+        return 0
+
+    def uram_estimation(self):
+        """Returns 0 (no URAM is estimated for this op)."""
+        return 0
+
+    def dsp_estimation(self, fpgapart=None):
+        """Returns two DSPFP32 FMA instances per PE (Horner evaluation)."""
+        return 2 * self.get_nodeattr("PE")
+
+    def execute_node(self, context, graph):
+        """Evaluate the node in software via finn.util.pwpolyf.PiecewisePolyActivation."""
+        # lazy import to avoid hard dependency on torch at module level
+        import torch
+        from finn.util.pwpolyf import PiecewisePolyActivation
+
+        node = self.onnx_node
+        func = self.get_nodeattr("func")
+        K = self.get_nodeattr("K")
+        mod = PiecewisePolyActivation(func, K=K)
+        with torch.no_grad():
+            x = torch.from_numpy(context[node.input[0]].astype(np.float32))
+            context[node.output[0]] = mod(x).numpy()
diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
index 06067a4fca..6c483ba0d3 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
@@ -35,6 +35,7 @@
     StreamingDataWidthConverter_rtl,
 )
 from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl
+from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import PWPolyF_rtl
 from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl
 from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl
 
@@ -48,4 +49,5 @@
 custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl
 custom_op["MVAU_rtl"] = MVAU_rtl
 custom_op["VVAU_rtl"] = VVAU_rtl
+custom_op["PWPolyF_rtl"] = PWPolyF_rtl
 custom_op["Thresholding_rtl"] = Thresholding_rtl
diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
new file mode 100644
index 0000000000..d4736f7fee
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
@@ -0,0 +1,289 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import struct
+from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io
+
+from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
+from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
+from finn.util.basic import (
+    get_rtlsim_trace_depth,
+    make_build_dir,
+    pyverilate_get_liveness_threshold_cycles,
+)
+from finn.util.data_packing import (
+    npy_to_rtlsim_input,
+    rtlsim_output_to_npy,
+)
+from finn.util.pwpolyf import (
+    NUM_OCTAVES,
+    SUPPORTED_FUNCS,
+    _fit_coefficients,
+)
+
+try:
+    from pyverilator import PyVerilator
+except ModuleNotFoundError:
+    PyVerilator = None
+
+
+def _float_to_hex(f):
+    """Encode f as IEEE 754 binary32 and return its 8 uppercase hex digits."""
+    return struct.pack(">f", float(f)).hex().upper()
+
+
+def generate_coeffs_svh(K, num_samples=1000):
+    """Return the pwpolyf_coeffs.svh text for K subdivision bits.
+
+    Coefficients come from _fit_coefficients(func_name, K, num_samples).
+    """
+    num_segs = 1 + 2 * NUM_OCTAVES * (1 << K)
+
+    # header comment, then the geometry constants
+    out = [
+        "// Auto-generated by pwpolyf_rtl.py — do not edit manually.",
+        "// K=%d, NUM_SEGS=%d, NUM_OCTAVES=%d, DEGREE=2" % (K, num_segs, NUM_OCTAVES),
+        "",
+        "localparam int unsigned PWPOLYF_K          = %d;" % K,
+        "localparam int unsigned PWPOLYF_NUM_SEGS   = %d;" % num_segs,
+        "localparam int unsigned PWPOLYF_NUM_OCTAVES = %d;" % NUM_OCTAVES,
+        "localparam int unsigned PWPOLYF_DEGREE     = 2;",
+    ]
+
+    for func_name in SUPPORTED_FUNCS:
+        table = _fit_coefficients(func_name, K, num_samples)
+        label = "PWPOLYF_%s_COEFFS" % func_name.upper()
+        out.append("")
+        out.append("localparam logic [31:0] %s[%d][3] = '{" % (label, num_segs))
+        # one row of three FP32 hex words per segment; the last row takes no comma
+        for seg in range(num_segs):
+            words = tuple(_float_to_hex(table[seg, col]) for col in range(3))
+            sep = "," if seg < num_segs - 1 else ""
+            out.append("    '{ 32'h%s, 32'h%s, 32'h%s }%s  // seg %d" % (words + (sep, seg)))
+        out.append("};")
+
+    out.append("")
+    return "\n".join(out)
+
+
+class PWPolyF_rtl(PWPolyF, RTLBackend):
+    """RTL variant of PWPolyF, wraps the finn-rtllib pwpolyf IP."""
+
+    def __init__(self, onnx_node, **kwargs):  # adds no state; only defers to the base initialisers
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        """Merge the PWPolyF and RTLBackend attribute schemas (RTLBackend wins on clashes)."""
+        merged = dict(PWPolyF.get_nodeattr_types(self))
+        merged.update(RTLBackend.get_nodeattr_types(self))
+        return merged
+
+    def prepare_codegen_rtl_values(self, model):
+        """Return the placeholder -> replacement-lines map for the RTL template files."""
+        pe = self.get_nodeattr("PE")
+        func = self.get_nodeattr("func")
+        # the same single-entry list serves as wrapper name and as top module name
+        wrapper_name = [self.get_verilog_top_module_name() + "_axi_wrapper"]
+        # input and output streams both carry PE words of 32 bits
+        stream_bits = str(pe * 32)
+
+        return {
+            "$MODULE_NAME_AXI_WRAPPER$": wrapper_name,
+            "$TOP_MODULE$": wrapper_name,
+            "$PE$": [str(pe)],
+            "$FUNC$": ['"%s"' % func],
+            "$IN_WIDTH$": [stream_bits],
+            "$OUT_WIDTH$": [stream_bits],
+        }
+
+    def get_rtl_file_list(self):
+        """Return the file names of the pwpolyf RTL sources, wrapper template last."""
+        # implementation sources and the coefficient header
+        core_sources = ["pwpolyf.sv", "pwpolyf_coeffs.svh", "queue.sv"]
+        # Verilog wrapper template with the code-generation placeholders
+        wrapper_template = "pwpolyf_template_wrapper.v"
+        return core_sources + [wrapper_template]
+
+    def get_rtl_file_paths(self):
+        """Return the path of every RTL source below $FINN_ROOT/finn-rtllib/pwpolyf/hdl/."""
+        # plain string concatenation; a missing FINN_ROOT surfaces as KeyError
+        hdl_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/pwpolyf/hdl/"
+        return [hdl_dir + name for name in self.get_rtl_file_list()]
+
+    def get_rtl_template_data(self, path):
+        """Return the text of the RTL source at path, decoded as UTF-8."""
+        with open(path, "r", encoding="utf-8") as f:  # sources hold non-ASCII text
+            return f.read()
+
+    def fill_in_rtl_template_data(self, replace_dict, template_data):
+        """Return template_data with each placeholder replaced by its newline-joined lines."""
+        filled = template_data
+        for placeholder, lines in replace_dict.items():
+            filled = filled.replace(placeholder, "\n".join(lines))
+        return filled
+
+    def dump_rtl_data(self, dest_dir, filename, data):
+        """Write data to dest_dir as UTF-8; templates are saved as <gen_top_module>.v."""
+        target = self.get_nodeattr("gen_top_module") + ".v" if "template" in filename else filename
+        with open(os.path.join(dest_dir, target), "w", encoding="utf-8") as f:
+            f.write(data)
+
+    def generate_hdl(self, model, fpgapart, clk):
+        """Write the filled-in RTL sources plus a coefficient header matching K."""
+        code_gen_dict = self.prepare_codegen_rtl_values(model)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0])
+
+        for rtl_file_path in self.get_rtl_file_paths():
+            template_data = self.get_rtl_template_data(rtl_file_path)
+            data = self.fill_in_rtl_template_data(code_gen_dict, template_data)
+            self.dump_rtl_data(code_gen_dir, os.path.basename(rtl_file_path), data)
+
+        # overwrite the copied stock header with one generated for this node's K; the
+        # generated text contains a non-ASCII character, so the encoding is pinned
+        svh_data = generate_coeffs_svh(self.get_nodeattr("K"))
+        with open(os.path.join(code_gen_dir, "pwpolyf_coeffs.svh"), "w", encoding="utf-8") as f:
+            f.write(svh_data)
+
+        self.set_nodeattr("ipgen_path", code_gen_dir)
+        self.set_nodeattr("ip_path", code_gen_dir)
+
+    def prepare_rtlsim(self):
+        """Build a PyVerilator simulation from the generated HDL of this node.
+
+        Raises ImportError when PyVerilator is not available. The name of the
+        simulation library is stored in the rtlsim_so nodeattr and the
+        simulation object is returned.
+        """
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        hdl_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        top_name = self.get_nodeattr("gen_top_module")
+
+        # exclude .svh — it is pulled in via `include from pwpolyf.sv
+        sources = []
+        for rtl_file in self.get_rtl_file_list():
+            if rtl_file.endswith(".svh"):
+                continue
+            sources.append(rtl_file.replace("pwpolyf_template_wrapper", top_name))
+        build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
+
+        sim = PyVerilator.build(
+            sources,
+            build_dir=build_dir,
+            verilog_path=[hdl_dir],
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=top_name,
+            auto_eval=False,
+        )
+
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def execute_node(self, context, graph):
+        """Execute this node in the mode selected by the exec_mode nodeattr.
+
+        "cppsim" delegates to PWPolyF.execute_node. "rtlsim" saves the folded
+        input as input_0.npy, streams it through the RTL simulation, records
+        the cycle count in cycles_rtlsim and stores the float32 result,
+        reshaped to the normal output shape, in *context*. Any other mode
+        raises an Exception.
+        """
+        mode = self.get_nodeattr("exec_mode")
+        if mode == "cppsim":
+            PWPolyF.execute_node(self, context, graph)
+            return
+        if mode != "rtlsim":
+            raise Exception(
+                "Invalid value for attribute exec_mode! Is currently set to: %s "
+                "has to be one of ('cppsim', 'rtlsim')" % mode
+            )
+
+        node = self.onnx_node
+        sim_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        in_npy_path = os.path.join(sim_dir, "input_0.npy")
+
+        # store the input in folded layout before packing it into stream words
+        folded_shape = self.get_folded_input_shape()
+        folded_in = context[node.input[0]].reshape(folded_shape)
+        in_dtype = self.get_input_datatype()
+        np.save(in_npy_path, folded_in.copy())
+
+        sim = self.get_rtlsim()
+        in_words = npy_to_rtlsim_input(in_npy_path, in_dtype, self.get_instream_width())
+        intf_names = self.get_verilog_top_module_intf_names()
+        in_stream = intf_names["s_axis"][0][0]
+        out_stream = intf_names["m_axis"][0][0]
+        io_dict = {"inputs": {in_stream: in_words}, "outputs": {out_stream: []}}
+
+        vcd_file = self.get_nodeattr("rtlsim_trace")
+        if vcd_file == "default":
+            vcd_file = self.onnx_node.name + ".vcd"
+
+        n_out_values = self.get_number_output_values()
+        reset_rtlsim(sim)
+        cycles = rtlsim_multi_io(
+            sim,
+            io_dict,
+            n_out_values,
+            trace_file=vcd_file,
+            sname="_",
+            liveness_threshold=pyverilate_get_liveness_threshold_cycles(),
+        )
+        self.set_nodeattr("cycles_rtlsim", cycles)
+        raw_words = io_dict["outputs"][out_stream]
+
+        out_dtype = self.get_output_datatype()
+        elem_bits = out_dtype.bitwidth()
+        stream_bits = self.get_outstream_width()
+        out_npy_path = os.path.join(sim_dir, "output.npy")
+        folded_out_shape = self.get_folded_output_shape()
+        rtlsim_output_to_npy(
+            raw_words, out_npy_path, out_dtype, folded_out_shape, stream_bits, elem_bits
+        )
+
+        result = np.asarray([np.load(out_npy_path)], dtype=np.float32)
+        context[node.output[0]] = result.reshape(*self.get_normal_output_shape())
+
+    def code_generation_ipi(self):
+        """Return the command strings that add this node to a block design.
+
+        The list creates a per-node source directory, adds every generated
+        RTL file to it (the wrapper template under its gen_top_module name)
+        and ends with a create_bd_cell command that references the generated
+        top module.
+        """
+        top_name = self.get_nodeattr("gen_top_module")
+        node_name = self.onnx_node.name
+        hdl_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        source_target = "./ip/verilog/rtl_ops/%s" % node_name
+
+        cmd = ["file mkdir %s" % source_target]
+        for template_name in self.get_rtl_file_list():
+            rtl_file = template_name.replace("pwpolyf_template_wrapper", top_name)
+            rtl_path = os.path.join(hdl_dir, rtl_file)
+            cmd.append("add_files -copy_to %s -norecurse %s" % (source_target, rtl_path))
+        cmd.append("create_bd_cell -type module -reference %s %s" % (top_name, node_name))
+
+        return cmd
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index e14181b140..c7e95b28bf 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -274,6 +274,53 @@ def apply(self, model):
         return (model, graph_modified)
 
 
+class InferPWPolyFLayer(Transformation):
+    """Convert PWPolyF nodes into piecewise polynomial activation HW layers."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        """Replace every non-HW PWPolyF node by its fpgadataflow counterpart.
+
+        func and K are copied from the source node (K falls back to 3 when the
+        attribute is absent), NumChannels is the innermost input dimension,
+        PE starts at 1 and the input datatype is reused for the output.
+        Returns (model, graph_modified).
+        """
+        hw_domain = "finn.custom_op.fpgadataflow"
+        graph = model.graph
+        graph_modified = False
+        # insert_pos is the slot right behind the node under inspection
+        for insert_pos, node in enumerate(graph.node, start=1):
+            if node.op_type != "PWPolyF" or node.domain == hw_domain:
+                continue
+
+            src_tensor = node.input[0]
+            dst_tensor = node.output[0]
+            in_shape = model.get_tensor_shape(src_tensor)
+            idt = model.get_tensor_datatype(src_tensor)
+
+            func = get_by_name(node.attribute, "func").s.decode("utf-8")
+            k_attr = get_by_name(node.attribute, "K")
+            if k_attr is None:
+                subdiv_bits = 3
+            else:
+                subdiv_bits = k_attr.i
+
+            hw_node = helper.make_node(
+                "PWPolyF",
+                [src_tensor],
+                [dst_tensor],
+                domain=hw_domain,
+                backend="fpgadataflow",
+                func=func,
+                K=subdiv_bits,
+                NumChannels=in_shape[-1],
+                PE=1,
+                inputDataType=idt.name,
+                outputDataType=idt.name,
+                numInputVectors=list(in_shape[:-1]),
+                name="PWPolyF_" + node.name,
+            )
+
+            # put the HW node behind the original, then drop the original
+            graph.node.insert(insert_pos, hw_node)
+            graph.node.remove(node)
+            graph_modified = True
+
+        return (model, graph_modified)
+
+
 class InferUpsample(Transformation):
     """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes."""
 
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index eaee499e6a..07b2e89f19 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -104,6 +104,7 @@ def apply(self, model):
             "ChannelwiseOp_hls",
             "DuplicateStreams_hls",
             "GlobalAccPool_hls",
+            "PWPolyF_rtl",
             "Thresholding_hls",
             "Thresholding_rtl",
         ]
diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py
new file mode 100644
index 0000000000..484cfde85c
--- /dev/null
+++ b/src/finn/util/pwpolyf.py
@@ -0,0 +1,236 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""
+Piecewise polynomial activation - PyTorch module and software model.
+
+Drop-in activation that approximates GELU, SiLU, Sigmoid, and Tanh using
+degree-2 polynomials, matching the pwpolyf RTL behaviour.  Emits a single
+PWPolyF custom op node during ONNX export (requires dynamo=False).
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Constants matching the SystemVerilog module
+# NUM_OCTAVES: number of power-of-two octaves per sign that are split into
+# polynomial segments (see _segment_boundaries).
+# EXP_BIAS: subtracted from a biased exponent to get the power of two;
+# presumably the FP32 exponent bias — confirm against the RTL.
+# EXP_BASE: biased exponent of the first octave; 125 - 127 = -2, so the
+# segmented range starts at 2**-2 = 0.25.
+# EXP_CLAMP: not referenced by the Python code in this file; 130 - 127 = 3,
+# i.e. 2**3 = 8.0, which coincides with the clamp threshold hard-coded in
+# _segment_index.
+NUM_OCTAVES = 5
+EXP_BIAS = 127
+EXP_BASE = 125
+EXP_CLAMP = 130
+
+# Function names accepted by PiecewisePolyActivation.
+SUPPORTED_FUNCS = ("gelu", "silu", "sigmoid", "tanh")
+
+# Exact PyTorch implementations the per-segment polynomials are fitted to.
+REFERENCE_FUNCS = {
+    "gelu": lambda x: F.gelu(x),
+    "silu": lambda x: F.silu(x),
+    "sigmoid": lambda x: torch.sigmoid(x),
+    "tanh": lambda x: torch.tanh(x),
+}
+
+# Output used once |x| >= 8: neg_clamp for negative inputs, pos_clamp for
+# positive inputs unless pos_passthrough is True, in which case x itself is
+# returned.
+CLAMP_CFG = {
+    "gelu":    {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
+    "silu":    {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
+    "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False},
+    "tanh":    {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False},
+}
+
+
+def _segment_boundaries(K):
+    """List the (lo, hi) interval of every polynomial segment.
+
+    Index 0 is the near-zero segment (-0.25, 0.25). It is followed by the
+    positive segments (NUM_OCTAVES octaves, each split into 2**K equal
+    sub-intervals) and then by their mirrored negative counterparts in the
+    same octave/sub-interval order.
+    """
+    num_subs = 1 << K
+
+    positive = []
+    for octave in range(NUM_OCTAVES):
+        base = 2.0 ** (EXP_BASE + octave - EXP_BIAS)
+        positive.extend(
+            (base * (1.0 + sub / num_subs), base * (1.0 + (sub + 1) / num_subs))
+            for sub in range(num_subs)
+        )
+
+    # negative segments are the sign-flipped positive ones, bounds swapped
+    negative = [(-hi, -lo) for lo, hi in positive]
+
+    return [(-0.25, 0.25)] + positive + negative
+
+
+def _fit_coefficients(func_name, K, num_samples=1000):
+    """Fit one degree-2 polynomial per segment to the reference function.
+
+    Every segment is sampled at *num_samples* evenly spaced float64 points,
+    the reference function is evaluated on their float32 cast, and a
+    least-squares quadratic is fitted. Returns a float32 tensor of shape
+    (NUM_SEGS, 3) holding the coefficients in ascending power order.
+    """
+    reference = REFERENCE_FUNCS[func_name]
+    rows = []
+    with torch.no_grad():
+        for lo, hi in _segment_boundaries(K):
+            xs = np.linspace(lo, hi, num_samples, dtype=np.float64)
+            ys = reference(torch.from_numpy(xs).float()).numpy().astype(np.float64)
+            rows.append(np.polynomial.polynomial.polyfit(xs, ys, deg=2)[:3])
+
+    table = np.stack(rows)
+    return torch.from_numpy(table.astype(np.float32))
+
+
+def _segment_index(x, K, num_subs, num_segs):
+    """Compute the coefficient-table index of every element of *x*.
+
+    Returns (seg_idx, is_neg_clamp, is_pos_clamp): seg_idx is 0 for
+    |x| < 0.25, otherwise it is derived from the octave (floor(log2|x|) + 2)
+    and the sub-interval within that octave, with negative inputs offset by
+    NUM_OCTAVES * num_subs. The two masks flag elements with |x| >= 8 by
+    sign. The *K* argument is not used by the computation.
+    """
+    magnitude = x.abs()
+    negative = x < 0
+    saturated = magnitude >= 8.0
+
+    # lift tiny magnitudes to 0.25 so log2 stays finite; those elements are
+    # redirected to segment 0 further down
+    bounded = magnitude.clamp(min=0.25)
+    exponent = torch.floor(torch.log2(bounded))
+    octave = (exponent + 2).long().clamp(0, NUM_OCTAVES - 1)
+
+    mantissa_frac = bounded / torch.exp2(exponent) - 1.0
+    sub = (mantissa_frac * num_subs).long().clamp(0, num_subs - 1)
+
+    pos_idx = 1 + octave * num_subs + sub
+    neg_idx = 1 + NUM_OCTAVES * num_subs + octave * num_subs + sub
+    signed_idx = torch.where(negative, neg_idx, pos_idx)
+
+    seg_idx = torch.where(magnitude < 0.25, torch.zeros_like(pos_idx), signed_idx)
+    seg_idx = seg_idx.clamp(0, num_segs - 1)
+
+    return seg_idx, negative & saturated, (~negative) & saturated
+
+
+class PWPolyFFunction(torch.autograd.Function):
+    """Autograd function that exports to ONNX as a single PWPolyF node.
+
+    forward evaluates the piecewise polynomial in software; symbolic emits
+    a PWPolyF op that carries only the func and K attributes.
+    """
+
+    @staticmethod
+    def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
+        passthrough = CLAMP_CFG[func]["pos_passthrough"]
+        subs_per_octave = 1 << K
+        total_segs = 1 + 2 * NUM_OCTAVES * subs_per_octave
+
+        flat = x.contiguous().view(-1)
+        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(
+            flat, K, subs_per_octave, total_segs
+        )
+
+        table = coeffs[seg_idx]
+        a0, a1, a2 = table[:, 0], table[:, 1], table[:, 2]
+
+        # Horner: y = a0 + x*(a1 + a2*x)
+        y = a0 + flat * (a1 + a2 * flat)
+
+        # beyond the clamp threshold the polynomial result is overridden
+        pos_val = flat if passthrough else pos_clamp_val.expand_as(y)
+        y = torch.where(is_pos_clamp, pos_val, y)
+        y = torch.where(is_neg_clamp, neg_clamp_val.expand_as(y), y)
+
+        return y.view(x.shape)
+
+    @staticmethod
+    def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
+        return g.op("PWPolyF", x, func_s=func, K_i=K)
+
+
+class PiecewisePolyActivation(nn.Module):
+    """
+    Software model of the pwpolyf activation, usable as a drop-in nn.Module.
+
+    The selected function is approximated by degree-2 polynomials, one per
+    segment, fitted at construction time and evaluated with Horner's method.
+    During ONNX export the module emits a single PWPolyF custom op node via
+    PWPolyFFunction.
+    """
+
+    def __init__(self, func="gelu", K=3, fit_samples=1000):
+        super().__init__()
+        if func not in SUPPORTED_FUNCS:
+            raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS))
+
+        clamp_cfg = CLAMP_CFG[func]
+        self.func = func
+        self.K = K
+        self.num_subs = 1 << K
+        self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs
+        self.pos_passthrough = clamp_cfg["pos_passthrough"]
+
+        # buffers: coefficient table first, then the two clamp constants
+        self.register_buffer("coeffs", _fit_coefficients(func, K, fit_samples))
+        for buf_name, cfg_key in (
+            ("neg_clamp_val", "neg_clamp"),
+            ("pos_clamp_val", "pos_clamp"),
+        ):
+            self.register_buffer(
+                buf_name, torch.tensor(clamp_cfg[cfg_key], dtype=torch.float32)
+            )
+
+    def forward(self, x):
+        if torch.onnx.is_in_onnx_export():
+            return PWPolyFFunction.apply(
+                x, self.coeffs, self.neg_clamp_val, self.pos_clamp_val,
+                self.func, self.K,
+            )
+
+        flat = x.contiguous().view(-1)
+        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(
+            flat, self.K, self.num_subs, self.num_segs
+        )
+
+        table = self.coeffs[seg_idx]
+        a0, a1, a2 = table[:, 0], table[:, 1], table[:, 2]
+
+        # Horner: y = a0 + x*(a1 + a2*x)
+        y = a0 + flat * (a1 + a2 * flat)
+
+        # beyond the clamp threshold the polynomial result is overridden
+        pos_val = flat if self.pos_passthrough else self.pos_clamp_val.expand_as(y)
+        y = torch.where(is_pos_clamp, pos_val, y)
+        y = torch.where(is_neg_clamp, self.neg_clamp_val.expand_as(y), y)
+
+        return y.view(x.shape)
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
new file mode 100644
index 0000000000..5a892396ff
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -0,0 +1,281 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import os
+import tempfile
+import torch
+from onnx import TensorProto, helper
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.transformation.general import GiveUniqueNodeNames
+from qonnx.transformation.infer_datatypes import InferDataTypes
+from qonnx.transformation.infer_shapes import InferShapes
+
+import finn.core.onnx_exec as oxe
+from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.util.pwpolyf import PiecewisePolyActivation
+from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+
+test_fpga_part = "xczu3eg-sbva484-1-e"
+
+
+def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs):
+    """Build a one-node PWPolyF HW-layer model with float input and output.
+
+    Both tensors have shape num_input_vecs + [num_channels]; the node starts
+    with PE=1 and FLOAT32 datatypes. Shape inference, datatype inference and
+    unique node naming are applied before the model is returned.
+    """
+    tensor_shape = num_input_vecs + [num_channels]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, tensor_shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, tensor_shape)
+
+    node_attrs = {
+        "func": func,
+        "K": K,
+        "NumChannels": num_channels,
+        "PE": 1,
+        "inputDataType": "FLOAT32",
+        "outputDataType": "FLOAT32",
+        "numInputVectors": num_input_vecs,
+    }
+    pwpolyf_node = helper.make_node(
+        "PWPolyF",
+        ["inp"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow",
+        backend="fpgadataflow",
+        name="PWPolyF_0",
+        **node_attrs,
+    )
+
+    graph = helper.make_graph(
+        nodes=[pwpolyf_node],
+        name="pwpolyf_graph",
+        inputs=[inp],
+        outputs=[outp],
+    )
+    model = ModelWrapper(helper.make_model(graph, producer_name="pwpolyf-test"))
+    for transform in (InferShapes(), InferDataTypes(), GiveUniqueNodeNames()):
+        model = model.transform(transform)
+    return model
+
+
+@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"])
+@pytest.mark.parametrize("num_channels", [4, 16])
+@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]])
+@pytest.mark.parametrize("fold", [-1, 1, 2])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_cppsim(func, num_channels, num_input_vecs, fold):
+    """Node execution must reproduce PiecewisePolyActivation for every folding."""
+    K = 3
+    # fold == -1 requests maximum folding, i.e. one channel per cycle
+    if fold == -1:
+        fold = num_channels
+    pe = num_channels // fold
+    if num_channels % pe != 0:
+        pytest.skip("Invalid folding configuration.")
+
+    model = make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs)
+    getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe)
+
+    # inputs reach beyond +-8 so the clamp regions are exercised as well
+    x = np.random.uniform(-10, 10, tuple(num_input_vecs + [num_channels]))
+    x = x.astype(np.float32)
+
+    reference = PiecewisePolyActivation(func, K=K)
+    with torch.no_grad():
+        y_expected = reference(torch.from_numpy(x)).numpy()
+
+    y_produced = oxe.execute_onnx(model, {"inp": x})["outp"]
+
+    assert y_produced.shape == y_expected.shape
+    assert np.allclose(y_produced, y_expected, atol=1e-6)
+
+
+@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_onnx_export(func):
+    """The legacy exporter must emit exactly one PWPolyF node with func and K."""
+    K = 3
+    num_channels = 32
+    module = PiecewisePolyActivation(func, K=K)
+    module.eval()
+    example = torch.randn(1, num_channels)
+
+    # only the file name is needed; the file is removed again in finally
+    with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as handle:
+        onnx_path = handle.name
+    try:
+        torch.onnx.export(
+            module,
+            example,
+            onnx_path,
+            input_names=["input"],
+            output_names=["output"],
+            opset_version=13,
+            dynamo=False,
+        )
+        import onnx
+        exported = onnx.load(onnx_path)
+    finally:
+        os.unlink(onnx_path)
+
+    pwp_nodes = [n for n in exported.graph.node if n.op_type == "PWPolyF"]
+    assert len(pwp_nodes) == 1
+    attrs = {attr.name: attr for attr in pwp_nodes[0].attribute}
+    assert attrs["func"].s.decode("utf-8") == func
+    assert attrs["K"].i == K
+
+
+@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_infer_transform(func):
+    """Export, convert with InferPWPolyFLayer, then execute and compare."""
+    K = 3
+    num_channels = 16
+    hw_domain = "finn.custom_op.fpgadataflow"
+    module = PiecewisePolyActivation(func, K=K)
+    module.eval()
+    example = torch.randn(1, num_channels)
+
+    # only the file name is needed; the file is removed again in finally
+    with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as handle:
+        onnx_path = handle.name
+    try:
+        torch.onnx.export(
+            module,
+            example,
+            onnx_path,
+            input_names=["inp"],
+            output_names=["outp"],
+            opset_version=13,
+            dynamo=False,
+        )
+        model = ModelWrapper(onnx_path)
+    finally:
+        os.unlink(onnx_path)
+
+    exported_node = model.graph.node[0]
+    assert exported_node.op_type == "PWPolyF"
+    assert exported_node.domain != hw_domain
+
+    model = model.transform(InferPWPolyFLayer())
+
+    hw_node = model.graph.node[0]
+    assert hw_node.op_type == "PWPolyF"
+    assert hw_node.domain == hw_domain
+
+    inst = getCustomOp(hw_node)
+    assert inst.get_nodeattr("func") == func
+    assert inst.get_nodeattr("K") == K
+    assert inst.get_nodeattr("NumChannels") == num_channels
+    assert inst.get_nodeattr("PE") == 1
+    assert inst.get_nodeattr("inputDataType") == "FLOAT32"
+
+    x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32)
+    y_produced = oxe.execute_onnx(model, {"inp": x})["outp"]
+
+    reference = PiecewisePolyActivation(func, K=K)
+    with torch.no_grad():
+        y_expected = reference(torch.from_numpy(x)).numpy()
+    assert np.allclose(y_produced, y_expected, atol=1e-6)
+
+
+@pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_specialize_rtl(func):
+    """SpecializeLayers must pick the RTL variant and keep func and K."""
+    K = 3
+    model = make_pwpolyf_modelwrapper(func, K, 8, [1])
+    model = model.transform(SpecializeLayers(test_fpga_part))
+
+    rtl_node = model.graph.node[0]
+    assert rtl_node.op_type == "PWPolyF_rtl"
+    assert rtl_node.domain == "finn.custom_op.fpgadataflow.rtl"
+
+    rtl_inst = getCustomOp(rtl_node)
+    assert rtl_inst.get_nodeattr("func") == func
+    assert rtl_inst.get_nodeattr("K") == K
+
+
+@pytest.mark.parametrize("func", ["gelu", "tanh"])
+@pytest.mark.parametrize("pe", [1, 2, 4])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_resource_estimates(func, pe):
+    """DSP and LUT estimates scale with PE; no BRAM or URAM is reported."""
+    model = make_pwpolyf_modelwrapper(func, 3, 8, [1])
+    inst = getCustomOp(model.graph.node[0])
+    inst.set_nodeattr("PE", pe)
+
+    expected_dsps = 2 * pe
+    expected_luts = 200 * pe
+    assert inst.dsp_estimation() == expected_dsps
+    assert inst.lut_estimation() == expected_luts
+    assert inst.bram_estimation() == 0
+    assert inst.uram_estimation() == 0
+
+
+@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_folded_shape(func):
+    """Check normal/folded shapes and stream widths for PE=1 and PE=4."""
+    model = make_pwpolyf_modelwrapper(func, 3, 12, [1, 3, 3])
+    inst = getCustomOp(model.graph.node[0])
+
+    # PE=1
+    normal_shape = (1, 3, 3, 12)
+    folded_pe1 = (1, 3, 3, 12, 1)
+    assert inst.get_normal_input_shape() == normal_shape
+    assert inst.get_normal_output_shape() == normal_shape
+    assert inst.get_folded_input_shape() == folded_pe1
+    assert inst.get_folded_output_shape() == folded_pe1
+
+    # PE=4
+    inst.set_nodeattr("PE", 4)
+    folded_pe4 = (1, 3, 3, 3, 4)
+    stream_bits = 4 * 32
+    assert inst.get_folded_input_shape() == folded_pe4
+    assert inst.get_folded_output_shape() == folded_pe4
+    assert inst.get_instream_width() == stream_bits
+    assert inst.get_outstream_width() == stream_bits
+
+
+@pytest.mark.parametrize("func", ["gelu", "silu"])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_exp_cycles(func):
+    """Expected cycles must match the fold count, before and after specialization."""
+    num_channels = 8
+    pe = 2
+    model = make_pwpolyf_modelwrapper(func, 3, num_channels, [1, 4, 4])
+    generic_inst = getCustomOp(model.graph.node[0])
+    generic_inst.set_nodeattr("PE", pe)
+
+    # folded shape = (1, 4, 4, 4, 2), exp_cycles = prod of all but last = 1*4*4*4 = 64
+    exp = generic_inst.get_exp_cycles()
+    assert exp == 1 * 4 * 4 * (num_channels // pe)
+
+    # exp_cycles_per_layer analysis only runs on specialized (rtl/hls) nodes
+    model = model.transform(SpecializeLayers(test_fpga_part))
+    rtl_node = model.graph.node[0]
+    getCustomOp(rtl_node).set_nodeattr("PE", pe)
+    cycles_by_node = model.analysis(exp_cycles_per_layer)
+    assert rtl_node.name in cycles_by_node
+    assert cycles_by_node[rtl_node.name] == exp

From abecad30f9c2e2aa618a2841f8bfcf0abcf7c0e2 Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Thu, 23 Apr 2026 09:04:03 +0100
Subject: [PATCH 02/12] nn.act detection and dynamo=True

---
 docs/finn/pwpolyf.md                          |  79 ++++--
 .../fpgadataflow/convert_to_hw_layers.py      | 182 ++++++++++--
 .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 267 ++++++++++++++++++
 3 files changed, 491 insertions(+), 37 deletions(-)

diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md
index c155470bae..a2d6544e04 100644
--- a/docs/finn/pwpolyf.md
+++ b/docs/finn/pwpolyf.md
@@ -13,7 +13,7 @@ K=3 this gives 81 segments. Segment selection reuses the FP32
 exponent/mantissa bit-fields directly, matching the RTL implementation.
 
 Polynomial coefficients are generated at HDL build time by
-`generate_coeffs_svh()` in `pwpolyf_sim.py`, which fits degree-2 polynomials
+`generate_coeffs_svh()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials
 to the reference PyTorch functions and writes the `pwpolyf_coeffs.svh` header.
 This ensures the RTL coefficients always match the configured K value.
 
@@ -22,24 +22,53 @@ This ensures the RTL coefficients always match the configured K value.
 
 ## Architecture
 
-PWPolyF is **RTL-only** (no HLS variant). The pipeline is:
+PWPolyF is **RTL-only** (no HLS variant). Two export paths are supported:
 
 ```
-PiecewisePolyActivation (PyTorch)
-    |  torch.onnx.export (dynamo=False)
-    v
-PWPolyF ONNX node
-    |  InferPWPolyFLayer
-    v
-PWPolyF HW op (finn.custom_op.fpgadataflow)
-    |  SpecializeLayers
-    v
-PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl)
-    |  generate_hdl
-    v
-finn-rtllib/pwpolyf/hdl/ SystemVerilog IP
+Path A: PiecewisePolyActivation        Path B: nn.GELU / nn.SiLU / etc.
+    |  torch.onnx.export                   |  torch.onnx.export
+    |  (dynamo=False)                      |  (dynamo=True or False)
+    v                                      v
+PWPolyF custom ONNX node           Standard ONNX ops (Gelu, Sigmoid,
+    |                               Tanh, Sigmoid+Mul for SiLU,
+    |                               Div+Erf+Add+Mul+Mul for GELU)
+    |                                      |
+    +------------- both paths -------------+
+                      |
+                InferPWPolyFLayer
+                      v
+            PWPolyF HW op (finn.custom_op.fpgadataflow)
+                      |  SpecializeLayers
+                      v
+            PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl)
+                      |  generate_hdl
+                      v
+            finn-rtllib/pwpolyf/hdl/ SystemVerilog IP
 ```
 
+### Standard ONNX op inference
+
+`InferPWPolyFLayer` recognises standard ONNX activation ops in addition to
+the explicit `PWPolyF` custom op. This allows models that use `nn.GELU`,
+`nn.SiLU`, `nn.Sigmoid`, or `nn.Tanh` to be exported with `dynamo=True`
+(or `dynamo=False`) and automatically converted to PWPolyF HW layers.
+
+| ONNX op type | Pattern | Maps to |
+|---|---|---|
+| `Gelu` (opset 20+) | Single node | `func="gelu"` |
+| `Div`+`Erf`+`Add`+`Mul`+`Mul` | `x * 0.5 * (1 + erf(x / sqrt(2)))` | `func="gelu"` |
+| `Sigmoid` | Single node (standalone) | `func="sigmoid"` |
+| `Tanh` | Single node | `func="tanh"` |
+| `Sigmoid` + `Mul` | `Mul(x, Sigmoid(x))` | `func="silu"` |
+
+Notes:
+- `Gelu` as a single ONNX node requires opset 20 or later. With lower
+  opsets (including `dynamo=True` which defaults to opset 18), GELU
+  decomposes into a 5-node Erf-based pattern. Both forms are matched.
+- SiLU (`nn.SiLU`) has no standard ONNX op; it decomposes to
+  `Sigmoid(x) * x`. The transformation detects this two-node pattern.
+- Only FLOAT32 inputs are converted. Quantised activations are skipped.
+
 ## Folding
 
 PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold.
@@ -59,11 +88,17 @@ Each PE instantiates its own polynomial evaluation pipeline (2 DSPs).
 
 ## ONNX export
 
-`PiecewisePolyActivation` exports as a single `PWPolyF` custom op via
-`torch.autograd.Function.symbolic()`. Requires the legacy TorchScript exporter
-(`dynamo=False` in `torch.onnx.export`).
+Two export paths are supported:
+
+1. **`PiecewisePolyActivation` (explicit)** — exports as a single `PWPolyF`
+   custom op via `torch.autograd.Function.symbolic()`. Requires
+   `dynamo=False`. Preserves the `K` attribute on the ONNX node.
+
+2. **Standard nn modules** (`nn.GELU`, `nn.SiLU`, `nn.Sigmoid`, `nn.Tanh`) —
+   export with `dynamo=True` or `dynamo=False`. Produces standard ONNX ops
+   that `InferPWPolyFLayer` converts to PWPolyF with default `K=3`.
 
-Attributes on the ONNX node:
+Attributes on the explicit PWPolyF ONNX node:
 - `func` (string): one of `gelu`, `silu`, `sigmoid`, `tanh`
 - `K` (int): mantissa subdivision bits (default 3)
 
@@ -112,11 +147,15 @@ Attributes on the ONNX node:
 
 ## Tests
 
-`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py` — 68 parametrized tests:
+`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py`:
 
 - **cppsim**: all 4 functions x 2 channel counts x 2 spatial shapes x 3 foldings
 - **ONNX export**: verifies single-node export for all functions
 - **InferPWPolyFLayer**: end-to-end export → transform → execute
+- **Standard op inference**: Gelu/Sigmoid/Tanh single-node + SiLU pattern
+- **Erf-based GELU inference**: 5-node Erf decomposition pattern matching + execution
+- **SiLU edge cases**: reversed Mul input order, multi-consumer Sigmoid
+- **Execution correctness**: standard ops produce same output as PiecewisePolyActivation
 - **SpecializeLayers**: verifies RTL specialization
 - **Resource estimates**: DSP/LUT/BRAM checks across PE values
 - **Folded shapes**: input/output/stream width calculations
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index c7e95b28bf..2427a4514a 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -275,17 +275,99 @@ def apply(self, model):
 
 
 class InferPWPolyFLayer(Transformation):
-    """Convert PWPolyF nodes into piecewise polynomial activation HW layers."""
+    """Convert PWPolyF custom ops and standard ONNX activations (Gelu, Sigmoid,
+    Tanh, SiLU pattern) into piecewise polynomial HW layers."""
+
+    _SINGLE_OP_MAP = {"Gelu": "gelu", "Tanh": "tanh"}
 
     def __init__(self):
         super().__init__()
 
+    @staticmethod
+    def _is_const_scalar(model, tensor_name, value, tol=1e-3):
+        """Check if *tensor_name* is a constant initializer equal to *value*."""
+        init = model.get_initializer(tensor_name)
+        if init is None:
+            return False
+        return init.size == 1 and abs(float(init.flat[0]) - value) < tol
+
+    def _match_erf_gelu(self, model, erf_node):
+        """Try to match the Erf-based GELU decomposition rooted at *erf_node*.
+
+        Pattern (opset < 20), each intermediate tensor having exactly one consumer:
+            Div(x, sqrt(2)) → Erf → Add(_, 1) → Mul(0.5, _) → Mul(x, _)
+
+        Returns (pwp_input, pwp_output, nodes_to_remove) on success, else None.
+        """
+        # --- backward: Erf input must come from Div(x, sqrt(2)) ---
+        div_node = model.find_producer(erf_node.input[0])
+        if div_node is None or div_node.op_type != "Div":
+            return None
+        # Div is not commutative: x must be the dividend and sqrt(2) ≈ 1.4142 the divisor
+        if not self._is_const_scalar(model, div_node.input[1], 1.4142135):
+            return None
+        gelu_input = div_node.input[0]
+        # the Div is removed on a match, so only the Erf may consume its output
+        if len(model.find_consumers(erf_node.input[0])) != 1:
+            return None
+
+        # --- forward: Erf → Add(_, 1) ---
+        erf_consumers = model.find_consumers(erf_node.output[0])
+        if len(erf_consumers) != 1 or erf_consumers[0].op_type != "Add":
+            return None
+        add_node = erf_consumers[0]
+        other_add = [i for i in add_node.input if i != erf_node.output[0]]
+        if len(other_add) != 1 or not self._is_const_scalar(model, other_add[0], 1.0):
+            return None
+
+        # --- Add → Mul(0.5, _) ---
+        add_consumers = model.find_consumers(add_node.output[0])
+        if len(add_consumers) != 1 or add_consumers[0].op_type != "Mul":
+            return None
+        mul_half = add_consumers[0]
+        other_mul_half = [i for i in mul_half.input if i != add_node.output[0]]
+        if len(other_mul_half) != 1 or not self._is_const_scalar(model, other_mul_half[0], 0.5):
+            return None
+
+        # --- Mul(0.5,_) → Mul(x, _) ---
+        half_consumers = model.find_consumers(mul_half.output[0])
+        if len(half_consumers) != 1 or half_consumers[0].op_type != "Mul":
+            return None
+        mul_x = half_consumers[0]
+        other_mul_x = [i for i in mul_x.input if i != mul_half.output[0]]
+        if len(other_mul_x) != 1 or other_mul_x[0] != gelu_input:
+            return None
+
+        nodes_to_remove = [div_node, erf_node, add_node, mul_half, mul_x]
+        return (gelu_input, mul_x.output[0], nodes_to_remove)
+
+    @staticmethod
+    def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3):
+        num_channels = in_shape[-1]
+        return helper.make_node(
+            "PWPolyF",
+            [pwp_input],
+            [pwp_output],
+            domain="finn.custom_op.fpgadataflow",
+            backend="fpgadataflow",
+            func=func,
+            K=K,
+            NumChannels=num_channels,
+            PE=1,
+            inputDataType=idt.name,
+            outputDataType=idt.name,
+            numInputVectors=list(in_shape[:-1]),
+            name=name,
+        )
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
         graph_modified = False
         for node in graph.node:
             node_ind += 1
+
+            # Case 1: PWPolyF custom op (dynamo=False export path)
             if node.op_type == "PWPolyF" and node.domain != "finn.custom_op.fpgadataflow":
                 pwp_input = node.input[0]
                 pwp_output = node.output[0]
@@ -296,28 +378,94 @@ def apply(self, model):
                 K_attr = get_by_name(node.attribute, "K")
                 K = K_attr.i if K_attr is not None else 3
 
-                num_channels = pwp_in_shape[-1]
-
-                new_node = helper.make_node(
-                    "PWPolyF",
-                    [pwp_input],
-                    [pwp_output],
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                    func=func,
-                    K=K,
-                    NumChannels=num_channels,
-                    PE=1,
-                    inputDataType=idt.name,
-                    outputDataType=idt.name,
-                    numInputVectors=list(pwp_in_shape[:-1]),
-                    name="PWPolyF_" + node.name,
+                new_node = self._make_pwpolyf_node(
+                    pwp_input, pwp_output, func, pwp_in_shape, idt,
+                    "PWPolyF_" + node.name, K,
                 )
+                graph.node.insert(node_ind, new_node)
+                graph.node.remove(node)
+                graph_modified = True
+
+            # Case 2: single-node standard ONNX activations (Gelu, Tanh)
+            elif node.op_type in self._SINGLE_OP_MAP:
+                pwp_input = node.input[0]
+                pwp_output = node.output[0]
+                pwp_in_shape = model.get_tensor_shape(pwp_input)
+                if pwp_in_shape is None or len(pwp_in_shape) < 1:
+                    continue
+                idt = model.get_tensor_datatype(pwp_input)
+                if idt != DataType["FLOAT32"]:
+                    continue
 
+                func = self._SINGLE_OP_MAP[node.op_type]
+                new_node = self._make_pwpolyf_node(
+                    pwp_input, pwp_output, func, pwp_in_shape, idt,
+                    "PWPolyF_" + node.name,
+                )
                 graph.node.insert(node_ind, new_node)
                 graph.node.remove(node)
                 graph_modified = True
 
+            # Case 3: Sigmoid — standalone or part of SiLU pattern
+            elif node.op_type == "Sigmoid":
+                sig_input = node.input[0]
+                sig_output = node.output[0]
+                pwp_in_shape = model.get_tensor_shape(sig_input)
+                if pwp_in_shape is None or len(pwp_in_shape) < 1:
+                    continue
+                idt = model.get_tensor_datatype(sig_input)
+                if idt != DataType["FLOAT32"]:
+                    continue
+
+                nodes_to_remove = [node]
+                func = "sigmoid"
+                pwp_output = sig_output
+
+                # Probe for SiLU: Sigmoid feeds a Mul whose other input
+                # is the same tensor x that enters the Sigmoid.
+                sig_consumers = model.find_consumers(sig_output)
+                if len(sig_consumers) == 1:
+                    mul_cand = sig_consumers[0]
+                    if mul_cand.op_type == "Mul":
+                        mul_inputs = list(mul_cand.input)
+                        other_idx = 1 if mul_inputs[0] == sig_output else 0
+                        if mul_inputs[other_idx] == sig_input:
+                            func = "silu"
+                            pwp_output = mul_cand.output[0]
+                            nodes_to_remove.append(mul_cand)
+
+                new_node = self._make_pwpolyf_node(
+                    sig_input, pwp_output, func, pwp_in_shape, idt,
+                    "PWPolyF_" + node.name,
+                )
+                graph.node.insert(node_ind, new_node)
+                for nd in nodes_to_remove:
+                    graph.node.remove(nd)
+                graph_modified = True
+
+            # Case 4: Erf-based GELU (dynamo=True / opset < 20)
+            # Div(x, sqrt(2)) → Erf → Add(_, 1) → Mul(0.5, _) → Mul(x, _)
+            elif node.op_type == "Erf":
+                match = self._match_erf_gelu(model, node)
+                if match is None:
+                    continue
+                pwp_input, pwp_output, nodes_to_remove = match
+                pwp_in_shape = model.get_tensor_shape(pwp_input)
+                if pwp_in_shape is None or len(pwp_in_shape) < 1:
+                    continue
+                idt = model.get_tensor_datatype(pwp_input)
+                if idt != DataType["FLOAT32"]:
+                    continue
+
+                new_node = self._make_pwpolyf_node(
+                    pwp_input, pwp_output, "gelu", pwp_in_shape, idt,
+                    "PWPolyF_" + node.name,
+                )
+                graph.node.insert(node_ind, new_node)
+                for nd in nodes_to_remove:
+                    graph.node.remove(nd)
+                graph_modified = True
+
         return (model, graph_modified)
 
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index 5a892396ff..e491d82eba 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -279,3 +279,270 @@ def test_pwpolyf_exp_cycles(func):
     exp_dict = model.analysis(exp_cycles_per_layer)
     assert node.name in exp_dict
     assert exp_dict[node.name] == exp
+
+
+# ---------- helpers for standard ONNX op inference tests ----------
+
+
+def make_standard_activation_model(op_type, num_channels, num_input_vecs):
+    """Build an ONNX model with a single standard activation op."""
+    shape = num_input_vecs + [num_channels]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
+
+    act_node = helper.make_node(op_type, ["inp"], ["outp"], name=op_type + "_0")
+    graph = helper.make_graph([act_node], "test_graph", [inp], [outp])
+    model = helper.make_model(graph, producer_name="test")
+    model.opset_import[0].version = 20
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    return model
+
+
+def make_silu_pattern_model(num_channels, num_input_vecs):
+    """Build ONNX model with Sigmoid + Mul pattern (SiLU)."""
+    shape = num_input_vecs + [num_channels]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
+    sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape)
+
+    sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0")
+    mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp"], name="Mul_0")
+
+    graph = helper.make_graph(
+        [sigmoid_node, mul_node], "silu_graph", [inp], [outp],
+    )
+    model = helper.make_model(graph, producer_name="test")
+    model = ModelWrapper(model)
+    model.graph.value_info.append(sig_out)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    return model
+
+
+def make_erf_gelu_model(num_channels, num_input_vecs):
+    """Build ONNX model with the Erf-based GELU decomposition.
+
+    Pattern: x * 0.5 * (1 + erf(x / sqrt(2)))
+    Nodes: Div(x, sqrt(2)) -> Erf -> Add(_, 1) -> Mul(0.5, _) -> Mul(x, _)
+    """
+    shape = num_input_vecs + [num_channels]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
+
+    sqrt2 = helper.make_tensor("sqrt2", TensorProto.FLOAT, [], [np.float32(np.sqrt(2))])
+    one = helper.make_tensor("one", TensorProto.FLOAT, [], [np.float32(1.0)])
+    half = helper.make_tensor("half", TensorProto.FLOAT, [], [np.float32(0.5)])
+
+    div_node = helper.make_node("Div", ["inp", "sqrt2"], ["div_out"], name="Div_0")
+    erf_node = helper.make_node("Erf", ["div_out"], ["erf_out"], name="Erf_0")
+    add_node = helper.make_node("Add", ["erf_out", "one"], ["add_out"], name="Add_0")
+    mul_half_node = helper.make_node("Mul", ["half", "add_out"], ["mul_half_out"], name="Mul_0")
+    mul_x_node = helper.make_node("Mul", ["inp", "mul_half_out"], ["outp"], name="Mul_1")
+
+    graph = helper.make_graph(
+        [div_node, erf_node, add_node, mul_half_node, mul_x_node],
+        "erf_gelu_graph", [inp], [outp],
+        initializer=[sqrt2, one, half],
+    )
+    model = helper.make_model(graph, producer_name="test")
+    model = ModelWrapper(model)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+    return model
+
+
+# ---------- standard ONNX op inference tests ----------
+
+
+@pytest.mark.parametrize("op_type,expected_func", [
+    ("Gelu", "gelu"),
+    ("Sigmoid", "sigmoid"),
+    ("Tanh", "tanh"),
+])
+@pytest.mark.parametrize("num_channels", [4, 16])
+@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_infer_standard_op(op_type, expected_func,
+                                   num_channels, num_input_vecs):
+    model = make_standard_activation_model(op_type, num_channels, num_input_vecs)
+
+    assert model.graph.node[0].op_type == op_type
+
+    model = model.transform(InferPWPolyFLayer())
+
+    assert len(model.graph.node) == 1
+    node = model.graph.node[0]
+    assert node.op_type == "PWPolyF"
+    assert node.domain == "finn.custom_op.fpgadataflow"
+
+    inst = getCustomOp(node)
+    assert inst.get_nodeattr("func") == expected_func
+    assert inst.get_nodeattr("K") == 3
+    assert inst.get_nodeattr("NumChannels") == num_channels
+    assert inst.get_nodeattr("PE") == 1
+    assert inst.get_nodeattr("inputDataType") == "FLOAT32"
+
+
+@pytest.mark.parametrize("num_channels", [4, 16])
+@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_infer_silu_pattern(num_channels, num_input_vecs):
+    model = make_silu_pattern_model(num_channels, num_input_vecs)
+
+    assert len(model.graph.node) == 2
+    assert model.graph.node[0].op_type == "Sigmoid"
+    assert model.graph.node[1].op_type == "Mul"
+
+    model = model.transform(InferPWPolyFLayer())
+
+    assert len(model.graph.node) == 1
+    node = model.graph.node[0]
+    assert node.op_type == "PWPolyF"
+    assert node.domain == "finn.custom_op.fpgadataflow"
+
+    inst = getCustomOp(node)
+    assert inst.get_nodeattr("func") == "silu"
+    assert inst.get_nodeattr("K") == 3
+    assert inst.get_nodeattr("NumChannels") == num_channels
+
+
+@pytest.mark.fpgadataflow
+def test_pwpolyf_infer_silu_reversed_mul_inputs():
+    """SiLU detection works regardless of Mul input order."""
+    num_channels = 8
+    shape = [1, num_channels]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape)
+    sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape)
+
+    sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0")
+    mul_node = helper.make_node("Mul", ["sig_out", "inp"], ["outp"], name="Mul_0")
+
+    graph = helper.make_graph([sigmoid_node, mul_node], "silu_graph", [inp], [outp])
+    model = helper.make_model(graph, producer_name="test")
+    model = ModelWrapper(model)
+    model.graph.value_info.append(sig_out)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    model = model.transform(InferPWPolyFLayer())
+
+    assert len(model.graph.node) == 1
+    inst = getCustomOp(model.graph.node[0])
+    assert inst.get_nodeattr("func") == "silu"
+
+
+@pytest.mark.fpgadataflow
+def test_pwpolyf_sigmoid_multi_consumer_no_silu():
+    """Sigmoid with multiple consumers becomes standalone sigmoid, not silu."""
+    num_channels = 8
+    shape = [1, num_channels]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
+    outp1 = helper.make_tensor_value_info("outp1", TensorProto.FLOAT, shape)
+    outp2 = helper.make_tensor_value_info("outp2", TensorProto.FLOAT, shape)
+    sig_out = helper.make_tensor_value_info("sig_out", TensorProto.FLOAT, shape)
+
+    sigmoid_node = helper.make_node("Sigmoid", ["inp"], ["sig_out"], name="Sigmoid_0")
+    mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp1"], name="Mul_0")
+    identity_node = helper.make_node("Identity", ["sig_out"], ["outp2"], name="Id_0")
+
+    graph = helper.make_graph(
+        [sigmoid_node, mul_node, identity_node], "test_graph",
+        [inp], [outp1, outp2],
+    )
+    model = helper.make_model(graph, producer_name="test")
+    model = ModelWrapper(model)
+    model.graph.value_info.append(sig_out)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataTypes())
+
+    model = model.transform(InferPWPolyFLayer())
+
+    pwp_nodes = [n for n in model.graph.node if n.op_type == "PWPolyF"]
+    assert len(pwp_nodes) == 1
+    inst = getCustomOp(pwp_nodes[0])
+    assert inst.get_nodeattr("func") == "sigmoid"
+    # Mul and Identity should remain
+    assert any(n.op_type == "Mul" for n in model.graph.node)
+    assert any(n.op_type == "Identity" for n in model.graph.node)
+
+
+@pytest.mark.parametrize("op_type,expected_func", [
+    ("Gelu", "gelu"),
+    ("Sigmoid", "sigmoid"),
+    ("Tanh", "tanh"),
+])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_standard_op_execution(op_type, expected_func):
+    num_channels = 16
+    model = make_standard_activation_model(op_type, num_channels, [1])
+    model = model.transform(InferPWPolyFLayer())
+
+    x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32)
+    y_produced = oxe.execute_onnx(model, {"inp": x})["outp"]
+
+    ref_mod = PiecewisePolyActivation(expected_func, K=3)
+    with torch.no_grad():
+        y_expected = ref_mod(torch.from_numpy(x)).numpy()
+    assert np.allclose(y_produced, y_expected, atol=1e-6)
+
+
+@pytest.mark.fpgadataflow
+def test_pwpolyf_silu_pattern_execution():
+    num_channels = 16
+    model = make_silu_pattern_model(num_channels, [1])
+    model = model.transform(InferPWPolyFLayer())
+
+    x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32)
+    y_produced = oxe.execute_onnx(model, {"inp": x})["outp"]
+
+    ref_mod = PiecewisePolyActivation("silu", K=3)
+    with torch.no_grad():
+        y_expected = ref_mod(torch.from_numpy(x)).numpy()
+    assert np.allclose(y_produced, y_expected, atol=1e-6)
+
+
+# ---------- Erf-based GELU inference tests ----------
+
+
+@pytest.mark.parametrize("num_channels", [4, 16])
+@pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_infer_erf_gelu_pattern(num_channels, num_input_vecs):
+    """Erf-based GELU decomposition (opset < 20) is converted to PWPolyF."""
+    model = make_erf_gelu_model(num_channels, num_input_vecs)
+
+    assert len(model.graph.node) == 5
+    assert model.graph.node[1].op_type == "Erf"
+
+    model = model.transform(InferPWPolyFLayer())
+
+    assert len(model.graph.node) == 1
+    node = model.graph.node[0]
+    assert node.op_type == "PWPolyF"
+    assert node.domain == "finn.custom_op.fpgadataflow"
+
+    inst = getCustomOp(node)
+    assert inst.get_nodeattr("func") == "gelu"
+    assert inst.get_nodeattr("K") == 3
+    assert inst.get_nodeattr("NumChannels") == num_channels
+    assert inst.get_nodeattr("PE") == 1
+    assert inst.get_nodeattr("inputDataType") == "FLOAT32"
+
+
+@pytest.mark.fpgadataflow
+def test_pwpolyf_erf_gelu_execution():
+    """Erf-based GELU produces same output as PiecewisePolyActivation."""
+    num_channels = 16
+    model = make_erf_gelu_model(num_channels, [1])
+    model = model.transform(InferPWPolyFLayer())
+
+    x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32)
+    y_produced = oxe.execute_onnx(model, {"inp": x})["outp"]
+
+    ref_mod = PiecewisePolyActivation("gelu", K=3)
+    with torch.no_grad():
+        y_expected = ref_mod(torch.from_numpy(x)).numpy()
+    assert np.allclose(y_produced, y_expected, atol=1e-6)

From 752ecd0a1af2ed85d2a147c3def3b791595e5af9 Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Thu, 23 Apr 2026 13:17:38 +0100
Subject: [PATCH 03/12] svh -> pkg and all k

---
 docs/finn/pwpolyf.md                          |  26 +-
 finn-rtllib/pwpolyf/hdl/pwpolyf.abc           |   5 +
 finn-rtllib/pwpolyf/hdl/pwpolyf.sv            | 164 +++-----
 finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh    | 344 ---------------
 finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv        | 395 ++++++++++++++++++
 finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv         | 145 +++++++
 .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py |  71 +++-
 .../fpgadataflow/convert_to_hw_layers.py      |  23 +-
 8 files changed, 683 insertions(+), 490 deletions(-)
 create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf.abc
 delete mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh
 create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
 create mode 100644 finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv

diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md
index a2d6544e04..cd8510a7ef 100644
--- a/docs/finn/pwpolyf.md
+++ b/docs/finn/pwpolyf.md
@@ -3,9 +3,12 @@
 ## Overview
 
 PWPolyF is a hardware activation layer that approximates nonlinear functions
-(GELU, SiLU, Sigmoid, Tanh) using degree-2 piecewise polynomials. Each segment
-is evaluated via Horner's method on two cascaded DSPFP32 FMA units, giving
-single-cycle-per-element throughput with no BRAM usage.
+(GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated via Horner's
+method on a chain of DSPFP32 FMA units. With the default degree 2, this uses
+two cascaded DSPs per PE, giving single-cycle-per-element throughput with no
+BRAM usage. Per-function configuration (clamping behaviour and polynomial
+coefficients) is delivered through a SystemVerilog package (`pwpolyf_pkg`)
+using a `func_cfg_t` struct.
 
 The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero
 region, positive octave sub-segments, and negative mirrors. With the default
@@ -13,12 +16,11 @@ K=3 this gives 81 segments. Segment selection reuses the FP32
 exponent/mantissa bit-fields directly, matching the RTL implementation.
 
 Polynomial coefficients are generated at HDL build time by
-`generate_coeffs_svh()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials
-to the reference PyTorch functions and writes the `pwpolyf_coeffs.svh` header.
-This ensures the RTL coefficients always match the configured K value.
-
-> **Note:** The RTL currently only supports K=3. Support for other K values
-> is planned for a future update to `pwpolyf.sv`.
+`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials
+to the reference PyTorch functions and writes `pwpolyf_pkg.sv` — a
+SystemVerilog package with one `func_cfg_t` struct per activation
+(clamping config + coefficient table). K can take any value; it defaults
+to 3 when inferred from standard ONNX ops.
 
 ## Architecture
 
@@ -130,7 +132,7 @@ Attributes on the explicit PWPolyF ONNX node:
 | File | Purpose |
 |------|---------|
 | `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) |
-| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, coefficient SVH generation, rtlsim, IPI) |
+| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, package generation, rtlsim, IPI) |
 | `util/pwpolyf.py` | PyTorch activation module, ONNX export, software simulation |
 | `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation |
 | `builder/build_dataflow_steps.py` | Build pipeline integration |
@@ -140,8 +142,8 @@ Attributes on the explicit PWPolyF ONNX node:
 
 | File | Purpose |
 |------|---------|
-| `finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Core polynomial evaluation pipeline |
-| `finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh` | Default K=3 coefficients (regenerated at build time) |
+| `finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv` | `func_cfg_t` struct per activation (coeffs + clamp config, regenerated per K) |
+| `finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Polynomial evaluation pipeline (Horner chain on DSPFP32) |
 | `finn-rtllib/pwpolyf/hdl/queue.sv` | Elastic FIFO for backpressure |
 | `finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v` | AXI-Stream wrapper template |
 
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.abc b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc
new file mode 100644
index 0000000000..06b77b967d
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc
@@ -0,0 +1,5 @@
+import  queue
+read_sv pwpolyf_pkg.sv
+read_sv pwpolyf.sv
+setup_tb  pwpolyf_tb
+setup_top pwpolyf
\ No newline at end of file
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
index 51196a9db6..a2257fe17f 100644
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
@@ -10,20 +10,21 @@
  * @description
  *	Supports GELU, SiLU, Sigmoid, and Tanh via `parameter string FUNC`.
  *
- *	Approximated by piecewise degree-2 polynomials over segments defined
- *	by FP32 bit-extraction.  Evaluated via Horner's method on a chain of
- *	2 DSPFP32 instances, each computing FMA: out = C + A*B.
+ *	Approximated by piecewise degree-D polynomials over segments defined
+ *	by FP32 bit-extraction, where D = DEGREE from pwpolyf_pkg.
+ *	Evaluated via Horner's method on a chain
+ *	of D DSPFP32 instances, each computing FMA: out = C + A*B.
  *
- *	Horner: y = a_0 + x*(a_1 + a_2*x)
- *	  Stage 0: out = a_1 + a_2 * x        (A=coeff[2], B=x, C=coeff[1])
- *	  Stage 1: out = a_0 + prev * x       (A=prev,     B=x, C=coeff[0])
+ *	Horner (degree D): y = a_0 + x*(a_1 + x*(... + x*a_D))
+ *	  Stage 0: out = a_{D-1} + a_D * x
+ *	  Stage j: out = a_{D-1-j} + prev * x   (j = 1 .. D-1)
  *
  *	Clamping for |x| >= 8 (5 octaves):
  *	  GELU/SiLU:  neg -> 0,   pos -> x  (pass-through)
  *	  Sigmoid:    neg -> 0,   pos -> 1.0
  *	  Tanh:       neg -> -1,  pos -> 1.0
  *
- *	Latency: 8 cycles (2 DSP stages x 4 cycles each).  II=1.
+ *	Latency: D * DSP_LAT cycles (D DSP stages x 4 cycles each).  II=1.
  ***************************************************************************/
 
 //===----------------------------------------------------------------------===//
@@ -47,9 +48,9 @@ module pwpolyf_dspfp32 (
 	//  FPOPMODE[1:0] = 01 (FP mode enable)
 	localparam logic [6:0]  MODE_FMA = 7'b00_110_01;
 
-	logic  invalid;
-	logic  overflow;
-	logic  underflow;
+	uwire  invalid;
+	uwire  overflow;
+	uwire  underflow;
 
 	DSPFP32 #(
 		.A_FPTYPE("B32"),
@@ -119,7 +120,7 @@ endmodule : pwpolyf_dspfp32
 
 //===----------------------------------------------------------------------===//
 // Full PE-wide streaming activation with piecewise polynomial approximation.
-// Hardcoded for DEGREE=2 from pwpolyf_coeffs.svh.
+// Degree D derived from DEGREE in pwpolyf_pkg.
 //===----------------------------------------------------------------------===//
 module pwpolyf #(
 	int unsigned  PE = 1,
@@ -140,18 +141,15 @@ module pwpolyf #(
 	input	logic  yrdy
 );
 
-	`include "pwpolyf_coeffs.svh"
+	import pwpolyf_pkg::*;
 
-	localparam int unsigned  K           = PWPOLYF_K;
-	localparam int unsigned  NUM_SEGS    = PWPOLYF_NUM_SEGS;
 	localparam int unsigned  NUM_SUBS    = 1 << K;
-	localparam int unsigned  NUM_OCTAVES = PWPOLYF_NUM_OCTAVES;
 	localparam int unsigned  DSP_LAT     = 4;
-	localparam int unsigned  LATENCY     = 2 * DSP_LAT;  // DEGREE=2
+	localparam int unsigned  LATENCY     = DEGREE * DSP_LAT;
 
 	initial begin
-		assert(PWPOLYF_DEGREE == 2) else begin
-			$error("%m: This implementation requires PWPOLYF_DEGREE == 2.");
+		assert(DEGREE >= 1) else begin
+			$error("%m: DEGREE must be >= 1.");
 			$finish;
 		end
 		assert(FUNC == "gelu" || FUNC == "silu" || FUNC == "sigmoid" || FUNC == "tanh") else begin
@@ -160,20 +158,12 @@ module pwpolyf #(
 		end
 	end
 
-	//=== Per-activation clamping parameters ==================================
-	localparam logic [31:0]  NEG_CLAMP_VAL =
-		FUNC == "tanh" ? 32'hBF800000 : 32'h00000000;  // tanh: -1.0, else: 0.0
-	localparam logic [31:0]  POS_CLAMP_VAL =
-		(FUNC == "sigmoid" || FUNC == "tanh") ? 32'h3F800000 : 32'h00000000;  // sigmoid/tanh: 1.0
-	localparam bit  POS_PASSTHROUGH =
-		(FUNC == "gelu" || FUNC == "silu") ? 1 : 0;  // gelu/silu: output=x
-
-	//=== Coefficient selection ===============================================
-	localparam logic [31:0]  COEFFS[NUM_SEGS][3] =
-		FUNC == "gelu"    ? PWPOLYF_GELU_COEFFS :
-		FUNC == "silu"    ? PWPOLYF_SILU_COEFFS :
-		FUNC == "sigmoid" ? PWPOLYF_SIGMOID_COEFFS :
-		                    PWPOLYF_TANH_COEFFS;
+	//=== Per-activation configuration =======================================
+	localparam func_cfg_t  CFG =
+		FUNC == "gelu"    ? GELU :
+		FUNC == "silu"    ? SILU :
+		FUNC == "sigmoid" ? SIGMOID :
+		                    TANH;
 
 	//=== Clamping exponent threshold =========================================
 	localparam int unsigned  EXP_CLAMP = 130;  // |x| >= 8.0
@@ -214,7 +204,7 @@ module pwpolyf #(
 	uwire [PE-1:0]  rvld_vec;
 	uwire  rvld;
 
-	for(genvar  pe = 0; pe < PE; pe++) begin : genPE
+	for(genvar  pe = 0; pe < PE; pe++) begin : gen_pe
 		uwire [31:0]  xi = x_cur[pe];
 
 		//--- Segment selector (combinational) --------------------------------
@@ -232,22 +222,17 @@ module pwpolyf #(
 
 		// Segment index for ROM lookup
 		uwire [6:0]  seg_idx;
-		if(1) begin : blkSegIdx
+		if(1) begin : blk_seg_idx
 			uwire [6:0]  pos_idx = 7'd1 + {1'b0, octave, sub};
 			uwire [6:0]  neg_idx = 7'(7'd1 + NUM_SUBS * NUM_OCTAVES) + {1'b0, octave, sub};
 			assign	seg_idx = is_near_zero? 7'd0 :
 			                  sign? neg_idx : pos_idx;
-		end : blkSegIdx
+		end : blk_seg_idx
 
-		//--- Coefficient lookup (combinational) ------------------------------
-		uwire [31:0]  coeff_a0 = COEFFS[seg_idx][0];
-		uwire [31:0]  coeff_a1 = COEFFS[seg_idx][1];
-		uwire [31:0]  coeff_a2 = COEFFS[seg_idx][2];
-
-		//--- Horner chain: 2 stages of pwpolyf_dspfp32 ----------------------
-		// Stage 0: s0 = a1 + a2 * x   (latency: 4 cycles)
-		// Stage 1: s1 = a0 + s0 * x   (latency: 4 cycles)
-		// Total: 8 cycles
+		//--- Horner chain: DEGREE stages of pwpolyf_dspfp32 ------------------
+		// Stage 0: s[0] = coeff[DEGREE-1] + coeff[DEGREE] * x
+		// Stage j: s[j] = coeff[DEGREE-1-j] + s[j-1] * x_delayed
+		// Total: DEGREE * DSP_LAT cycles
 
 		// Valid pipeline
 		logic [LATENCY-1:0]  Vld = '0;
@@ -257,57 +242,42 @@ module pwpolyf #(
 		end
 		assign	rvld_vec[pe] = Vld[$left(Vld)];
 
-		// Delay x by 4 cycles for stage 1 input
-		logic [31:0]  Xd1 = 'x;
-		logic [31:0]  Xd2 = 'x;
-		logic [31:0]  Xd3 = 'x;
-		logic [31:0]  Xd4 = 'x;
-		always_ff @(posedge clk) begin
-			Xd1 <= xi;
-			Xd2 <= Xd1;
-			Xd3 <= Xd2;
-			Xd4 <= Xd3;
-		end
-
-		// Delay x by 8 cycles for pass-through on positive clamp
-		logic [31:0]  Xd5 = 'x;
-		logic [31:0]  Xd6 = 'x;
-		logic [31:0]  Xd7 = 'x;
-		logic [31:0]  Xd8 = 'x;
-		always_ff @(posedge clk) begin
-			Xd5 <= Xd4;
-			Xd6 <= Xd5;
-			Xd7 <= Xd6;
-			Xd8 <= Xd7;
-		end
-
-		// Delay a0 by 4 cycles for stage 1 C input
-		logic [31:0]  C0d1 = 'x;
-		logic [31:0]  C0d2 = 'x;
-		logic [31:0]  C0d3 = 'x;
-		logic [31:0]  C0d4 = 'x;
+		// Delay x for DSP B inputs and pass-through clamp
+		logic [31:0]  XDly[LATENCY] = '{default: 'x};
 		always_ff @(posedge clk) begin
-			C0d1 <= coeff_a0;
-			C0d2 <= C0d1;
-			C0d3 <= C0d2;
-			C0d4 <= C0d3;
+			XDly[0] <= xi;
+			for(int i = 1; i < LATENCY; i++)
+				XDly[i] <= XDly[i-1];
 		end
 
-		// Stage 0: s0 = coeff_a1 + coeff_a2 * xi
-		uwire [31:0]  s0;
-		pwpolyf_dspfp32 dsp0 (
-			.clk, .rst,
-			.a(coeff_a2), .b(xi), .c(coeff_a1),
-			.r(s0), .rvld(Vld[3])
-		);
-
-		// Stage 1: s1 = a0_delayed + s0 * x_delayed
-		uwire [31:0]  s1;
-		pwpolyf_dspfp32 dsp1 (
-			.clk, .rst,
-			.a(s0), .b(Xd4), .c(C0d4),
-			.r(s1), .rvld(Vld[7])
-		);
+		// DSP chain
+		uwire [31:0]  s[DEGREE];
+
+		for(genvar  j = 0; j < DEGREE; j++) begin : genDSP
+			uwire [31:0]  dsp_a = (j == 0)? CFG.coeffs[seg_idx][DEGREE] : s[j-1];
+			uwire [31:0]  dsp_b = (j == 0)? xi : XDly[j*DSP_LAT - 1];
+
+			// C input: coeff[DEGREE-1-j] delayed by j*DSP_LAT cycles
+			logic [31:0]  dsp_c;
+			if(j == 0) begin : genCdir
+				assign  dsp_c = CFG.coeffs[seg_idx][DEGREE-1];
+			end : genCdir
+			else begin : genCdly
+				logic [31:0]  CDly[j*DSP_LAT] = '{default: 'x};
+				always_ff @(posedge clk) begin
+					CDly[0] <= CFG.coeffs[seg_idx][DEGREE-1-j];
+					for(int i = 1; i < j*DSP_LAT; i++)
+						CDly[i] <= CDly[i-1];
+				end
+				assign  dsp_c = CDly[j*DSP_LAT - 1];
+			end : genCdly
+
+			pwpolyf_dspfp32  dsp (
+				.clk, .rst,
+				.a(dsp_a), .b(dsp_b), .c(dsp_c),
+				.r(s[j]), .rvld(Vld[(j+1)*DSP_LAT - 1])
+			);
+		end : genDSP
 
 		//--- Clamp mux -------------------------------------------------------
 		logic [LATENCY-1:0]  NegClamp = '0;
@@ -324,11 +294,11 @@ module pwpolyf #(
 		end
 
 		// Output mux
-		assign	r[pe] = NegClamp[$left(NegClamp)]? NEG_CLAMP_VAL :
-		                 PosClamp[$left(PosClamp)]? (POS_PASSTHROUGH? Xd8 : POS_CLAMP_VAL) :
-		                 s1;
+		assign	r[pe] = NegClamp[$left(NegClamp)]? CFG.neg_clamp :
+		                 PosClamp[$left(PosClamp)]? (CFG.pos_passthrough? XDly[LATENCY-1] : CFG.pos_clamp) :
+		                 s[DEGREE-1];
 
-	end : genPE
+	end : gen_pe
 
 	// All PE results should be valid simultaneously
 	assign	rvld = rvld_vec[0];
@@ -353,4 +323,4 @@ module pwpolyf #(
 		end
 	end
 
-endmodule : pwpolyf
+endmodule : pwpolyf
\ No newline at end of file
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh b/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh
deleted file mode 100644
index 4783a69a8c..0000000000
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf_coeffs.svh
+++ /dev/null
@@ -1,344 +0,0 @@
-// Auto-generated by pwpolyf_sim.py — do not edit manually.
-// K=3, NUM_SEGS=81, NUM_OCTAVES=5, DEGREE=2
-
-localparam int unsigned PWPOLYF_K          = 3;
-localparam int unsigned PWPOLYF_NUM_SEGS   = 81;
-localparam int unsigned PWPOLYF_NUM_OCTAVES = 5;
-localparam int unsigned PWPOLYF_DEGREE     = 2;
-
-localparam logic [31:0] PWPOLYF_GELU_COEFFS[81][3] = '{
-    '{ 32'h37B98E70, 32'h3F000000, 32'h3ECA71FE },  // seg 0
-    '{ 32'hBA7ADBC7, 32'h3F0278C5, 32'h3EBE3708 },  // seg 1
-    '{ 32'hBAC1FAC3, 32'h3F036C36, 32'h3EBAD62D },  // seg 2
-    '{ 32'hBB0F7119, 32'h3F049537, 32'h3EB7205C },  // seg 3
-    '{ 32'hBB4C3199, 32'h3F05F665, 32'h3EB31D7A },  // seg 4
-    '{ 32'hBB8CE270, 32'h3F0793DC, 32'h3EAECF5D },  // seg 5
-    '{ 32'hBBBD42FF, 32'h3F096FF1, 32'h3EAA3BDC },  // seg 6
-    '{ 32'hBBF86AD7, 32'h3F0B8C93, 32'h3EA5686A },  // seg 7
-    '{ 32'hBC1FE938, 32'h3F0DEDAD, 32'h3EA05544 },  // seg 8
-    '{ 32'hBC61967B, 32'h3F11FAE4, 32'h3E985544 },  // seg 9
-    '{ 32'hBCA9F9E2, 32'h3F1853F6, 32'h3E8D0DA0 },  // seg 10
-    '{ 32'hBCF4024D, 32'h3F1FBA03, 32'h3E81380E },  // seg 11
-    '{ 32'hBD283275, 32'h3F281F23, 32'h3E6A0515 },  // seg 12
-    '{ 32'hBD6012DC, 32'h3F316E8A, 32'h3E5131A8 },  // seg 13
-    '{ 32'hBD90E65A, 32'h3F3B8AB1, 32'h3E384EA9 },  // seg 14
-    '{ 32'hBDB69B12, 32'h3F46505E, 32'h3E1FAEE2 },  // seg 15
-    '{ 32'hBDE0E236, 32'h3F519680, 32'h3E07A0FB },  // seg 16
-    '{ 32'hBE13A447, 32'h3F62F98E, 32'h3DCA8920 },  // seg 17
-    '{ 32'hBE483C53, 32'h3F7A5DD5, 32'h3D6E8B3A },  // seg 18
-    '{ 32'hBE7FADD1, 32'h3F8848AA, 32'h3CC08378 },  // seg 19
-    '{ 32'hBE9AF2F6, 32'h3F9227E3, 32'hBB967C2C },  // seg 20
-    '{ 32'hBEB35A6C, 32'h3F9A4E1A, 32'hBCD3D2D3 },  // seg 21
-    '{ 32'hBEC733E2, 32'h3FA06D38, 32'hBD2655D9 },  // seg 22
-    '{ 32'hBED5117D, 32'h3FA46699, 32'hBD4ACA8A },  // seg 23
-    '{ 32'hBEDC221E, 32'h3FA64B98, 32'hBD5B0CC8 },  // seg 24
-    '{ 32'hBED977F6, 32'h3FA5A98F, 32'hBD563F50 },  // seg 25
-    '{ 32'hBEC1A7EF, 32'h3FA066C3, 32'hBD310A4D },  // seg 26
-    '{ 32'hBE9AF247, 32'h3F98AA16, 32'hBCFF13C8 },  // seg 27
-    '{ 32'hBE609014, 32'h3F90E5FD, 32'hBCA4952F },  // seg 28
-    '{ 32'hBE1465DA, 32'h3F8A89C8, 32'hBC412BC3 },  // seg 29
-    '{ 32'hBDB39147, 32'h3F860470, 32'hBBCFC6D8 },  // seg 30
-    '{ 32'hBD47BD32, 32'h3F832984, 32'hBB4E0A09 },  // seg 31
-    '{ 32'hBCCC65EE, 32'h3F8187F7, 32'hBABC9CBA },  // seg 32
-    '{ 32'hBC07A969, 32'h3F807817, 32'hB9D51508 },  // seg 33
-    '{ 32'hBABA43A4, 32'h3F8012B3, 32'hB870A727 },  // seg 34
-    '{ 32'hB93762D6, 32'h3F800216, 32'hB6C22359 },  // seg 35
-    '{ 32'h3411E06E, 32'h3F800000, 32'h27AB3551 },  // seg 36
-    '{ 32'h341E8FEE, 32'h3F800000, 32'h28A3B0E4 },  // seg 37
-    '{ 32'h342B3EFE, 32'h3F800000, 32'hA7CDB1C0 },  // seg 38
-    '{ 32'h3437EE42, 32'h3F800000, 32'hA8538CE4 },  // seg 39
-    '{ 32'h34449DB9, 32'h3F800000, 32'hA71AF986 },  // seg 40
-    '{ 32'hBA7AD37E, 32'h3EFB0E96, 32'h3EBE3747 },  // seg 41
-    '{ 32'hBAC20DB3, 32'h3EF92715, 32'h3EBAD556 },  // seg 42
-    '{ 32'hBB0F6B5B, 32'h3EF6D5D9, 32'h3EB720C8 },  // seg 43
-    '{ 32'hBB4C290B, 32'h3EF41395, 32'h3EB31DFE },  // seg 44
-    '{ 32'hBB8CE04D, 32'h3EF0D873, 32'h3EAECF95 },  // seg 45
-    '{ 32'hBBBD43D9, 32'h3EED2010, 32'h3EAA3BCD },  // seg 46
-    '{ 32'hBBF87DB4, 32'h3EE8E58C, 32'h3EA566F9 },  // seg 47
-    '{ 32'hBC1FE3E0, 32'h3EE42555, 32'h3EA055F8 },  // seg 48
-    '{ 32'hBC6197C8, 32'h3EDC0A12, 32'h3E98551F },  // seg 49
-    '{ 32'hBCA9FA8F, 32'h3ECF57EF, 32'h3E8D0D83 },  // seg 50
-    '{ 32'hBCF40310, 32'h3EC08BD5, 32'h3E8137F1 },  // seg 51
-    '{ 32'hBD2834D6, 32'h3EAFC0E3, 32'h3E6A03EA },  // seg 52
-    '{ 32'hBD6013D1, 32'h3E9D229D, 32'h3E513144 },  // seg 53
-    '{ 32'hBD90E5AD, 32'h3E88EB08, 32'h3E384F28 },  // seg 54
-    '{ 32'hBDB6985F, 32'h3E66C185, 32'h3E1FB08A },  // seg 55
-    '{ 32'hBDE0DF77, 32'h3E39A8DB, 32'h3E07A275 },  // seg 56
-    '{ 32'hBE13A40F, 32'h3DE8345F, 32'h3DCA8983 },  // seg 57
-    '{ 32'hBE483C8C, 32'h3CB44289, 32'h3D6E8AAF },  // seg 58
-    '{ 32'hBE7FAE75, 32'hBD848C95, 32'h3CC08085 },  // seg 59
-    '{ 32'hBE9AF25E, 32'hBE113D70, 32'hBB9669BA },  // seg 60
-    '{ 32'hBEB35AF3, 32'hBE527226, 32'hBCD3D632 },  // seg 61
-    '{ 32'hBEC73409, 32'hBE81B50C, 32'hBD265643 },  // seg 62
-    '{ 32'hBED511FB, 32'hBE919AF1, 32'hBD4ACBC1 },  // seg 63
-    '{ 32'hBEDC2198, 32'hBE992DD4, 32'hBD5B0BA5 },  // seg 64
-    '{ 32'hBED9784C, 32'hBE96A68E, 32'hBD563FE9 },  // seg 65
-    '{ 32'hBEC1A80B, 32'hBE819B22, 32'hBD310A73 },  // seg 66
-    '{ 32'hBE9AF281, 32'hBE45510D, 32'hBCFF1457 },  // seg 67
-    '{ 32'hBE60906A, 32'hBE073026, 32'hBCA4958A },  // seg 68
-    '{ 32'hBE146346, 32'hBDA8992A, 32'hBC41276D },  // seg 69
-    '{ 32'hBDB38EDC, 32'hBD408B15, 32'hBBCFC36C },  // seg 70
-    '{ 32'hBD47AFC4, 32'hBCCA5237, 32'hBB4DF9C3 },  // seg 71
-    '{ 32'hBCCC9FAA, 32'hBC443685, 32'hBABCD98B },  // seg 72
-    '{ 32'hBC07BF94, 32'hBB7057D9, 32'hB9D53AA2 },  // seg 73
-    '{ 32'hBABACF46, 32'hBA160B9E, 32'hB8715A94 },  // seg 74
-    '{ 32'hB93E544F, 32'hB88BBAC8, 32'hB6CD58B9 },  // seg 75
-    '{ 32'hB80D0FA0, 32'hB7425B86, 32'hB585D73F },  // seg 76
-    '{ 32'h00000000, 32'h00000000, 32'h00000000 },  // seg 77
-    '{ 32'h00000000, 32'h00000000, 32'h00000000 },  // seg 78
-    '{ 32'h00000000, 32'h00000000, 32'h00000000 },  // seg 79
-    '{ 32'h00000000, 32'h00000000, 32'h00000000 }  // seg 80
-};
-
-localparam logic [31:0] PWPOLYF_SILU_COEFFS[81][3] = '{
-    '{ 32'h36E95DF5, 32'h3F000000, 32'h3E7EDC5E },  // seg 0
-    '{ 32'hB99F1DCE, 32'h3F00C86D, 32'h3E771EC2 },  // seg 1
-    '{ 32'hB9F6B213, 32'h3F01162E, 32'h3E74F652 },  // seg 2
-    '{ 32'hBA36FFF7, 32'h3F017588, 32'h3E72946C },  // seg 3
-    '{ 32'hBA82DBFB, 32'h3F01E7EE, 32'h3E6FFB3C },  // seg 4
-    '{ 32'hBAB54FCB, 32'h3F026E5A, 32'h3E6D2ECC },  // seg 5
-    '{ 32'hBAF49B79, 32'h3F030A0C, 32'h3E6A30AE },  // seg 6
-    '{ 32'hBB212F9B, 32'h3F03BBB5, 32'h3E6704CC },  // seg 7
-    '{ 32'hBB50782F, 32'h3F048563, 32'h3E63A86F },  // seg 8
-    '{ 32'hBB945CE5, 32'h3F05E1D4, 32'h3E5E4873 },  // seg 9
-    '{ 32'hBBE25E30, 32'h3F080BF2, 32'h3E569788 },  // seg 10
-    '{ 32'hBC24C259, 32'h3F0A9F94, 32'h3E4E59B3 },  // seg 11
-    '{ 32'hBC66B42F, 32'h3F0D9E75, 32'h3E45A38E },  // seg 12
-    '{ 32'hBC9C5244, 32'h3F11080E, 32'h3E3C8A8D },  // seg 13
-    '{ 32'hBCCE04C3, 32'h3F14DA5B, 32'h3E3322D2 },  // seg 14
-    '{ 32'hBD04800C, 32'h3F191096, 32'h3E298289 },  // seg 15
-    '{ 32'hBD26E43B, 32'h3F1DA640, 32'h3E1FBAD7 },  // seg 16
-    '{ 32'hBD637E9F, 32'h3F252125, 32'h3E10F462 },  // seg 17
-    '{ 32'hBDA33B9D, 32'h3F301FA3, 32'h3DFAD009 },  // seg 18
-    '{ 32'hBDDE6B32, 32'h3F3BF5F1, 32'h3DD4EBDE },  // seg 19
-    '{ 32'hBE1121CB, 32'h3F484C5B, 32'h3DB103B4 },  // seg 20
-    '{ 32'hBE369CC7, 32'h3F54CBA1, 32'h3D8FABD4 },  // seg 21
-    '{ 32'hBE5EB1E1, 32'h3F612225, 32'h3D6290D3 },  // seg 22
-    '{ 32'hBE842968, 32'h3F6D086D, 32'h3D2C2202 },  // seg 23
-    '{ 32'hBE99340A, 32'h3F7842D6, 32'h3CF862F4 },  // seg 24
-    '{ 32'hBEB7B0F6, 32'h3F83AB48, 32'h3C810FDC },  // seg 25
-    '{ 32'hBEDD21FD, 32'h3F8C031A, 32'h3AA0895B },  // seg 26
-    '{ 32'hBEFBE1D7, 32'h3F922E5A, 32'hBC0A6628 },  // seg 27
-    '{ 32'hBF0931A3, 32'h3F9649B5, 32'hBC6A59DD },  // seg 28
-    '{ 32'hBF101E8F, 32'h3F989B85, 32'hBC8E0AB0 },  // seg 29
-    '{ 32'hBF12EEED, 32'h3F997B24, 32'hBC96B85F },  // seg 30
-    '{ 32'hBF121CE8, 32'h3F994071, 32'hBC94AB86 },  // seg 31
-    '{ 32'hBF0E4CE0, 32'h3F983CFA, 32'hBC8C0C06 },  // seg 32
-    '{ 32'hBF047881, 32'h3F95D0C8, 32'hBC71DF34 },  // seg 33
-    '{ 32'hBEE597C0, 32'h3F91E386, 32'hBC3A0377 },  // seg 34
-    '{ 32'hBEBEB59D, 32'h3F8DFF08, 32'hBC081E5A },  // seg 35
-    '{ 32'hBE994535, 32'h3F8A965C, 32'hBBC0C1D9 },  // seg 36
-    '{ 32'hBE700CF7, 32'h3F87CFE4, 32'hBB856F55 },  // seg 37
-    '{ 32'hBE37FE86, 32'h3F85A6F4, 32'hBB35A189 },  // seg 38
-    '{ 32'hBE0A8F49, 32'h3F8406CD, 32'hBAF420DA },  // seg 39
-    '{ 32'hBDCD8C89, 32'h3F82D4E6, 32'hBAA265F3 },  // seg 40
-    '{ 32'hB99F15B4, 32'h3EFE6F36, 32'h3E771F07 },  // seg 41
-    '{ 32'hB9F6E275, 32'h3EFDD355, 32'h3E74F54A },  // seg 42
-    '{ 32'hBA370764, 32'h3EFD14DB, 32'h3E729434 },  // seg 43
-    '{ 32'hBA82CBD2, 32'h3EFC307E, 32'h3E6FFC3C },  // seg 44
-    '{ 32'hBAB519A0, 32'h3EFB2461, 32'h3E6D318E },  // seg 45
-    '{ 32'hBAF49002, 32'h3EF9EC20, 32'h3E6A3138 },  // seg 46
-    '{ 32'hBB213894, 32'h3EF88848, 32'h3E670422 },  // seg 47
-    '{ 32'hBB509289, 32'h3EF6F461, 32'h3E63A6AF },  // seg 48
-    '{ 32'hBB946026, 32'h3EF43C28, 32'h3E5E481D },  // seg 49
-    '{ 32'hBBE25D58, 32'h3EEFE827, 32'h3E56979C },  // seg 50
-    '{ 32'hBC24C311, 32'h3EEAC0C7, 32'h3E4E599C },  // seg 51
-    '{ 32'hBC66B0C8, 32'h3EE4C363, 32'h3E45A3FC },  // seg 52
-    '{ 32'hBC9C5250, 32'h3EDDEFE3, 32'h3E3C8A8A },  // seg 53
-    '{ 32'hBCCE0263, 32'h3ED64BA6, 32'h3E33233F },  // seg 54
-    '{ 32'hBD0483B9, 32'h3ECDDDD0, 32'h3E29816B },  // seg 55
-    '{ 32'hBD26E21A, 32'h3EC4B40E, 32'h3E1FBB6C },  // seg 56
-    '{ 32'hBD637E94, 32'h3EB5BDB7, 32'h3E10F463 },  // seg 57
-    '{ 32'hBDA33BD9, 32'h3E9FC0A2, 32'h3DFACFDF },  // seg 58
-    '{ 32'hBDDE6AB7, 32'h3E88144E, 32'h3DD4EC2A },  // seg 59
-    '{ 32'hBE11213C, 32'h3E5ECF5B, 32'h3DB10440 },  // seg 60
-    '{ 32'hBE369BA8, 32'h3E2CD2EE, 32'h3D8FACC1 },  // seg 61
-    '{ 32'hBE5EB20C, 32'h3DF6EE82, 32'h3D6290A4 },  // seg 62
-    '{ 32'hBE8428DF, 32'h3D97BEFB, 32'h3D2C2354 },  // seg 63
-    '{ 32'hBE9933B8, 32'h3CF7AA94, 32'h3CF8644F },  // seg 64
-    '{ 32'hBEB7B09B, 32'hBCEACCA1, 32'h3C811126 },  // seg 65
-    '{ 32'hBEDD2230, 32'hBDC0323A, 32'h3AA081A5 },  // seg 66
-    '{ 32'hBEFBE177, 32'hBE11723B, 32'hBC0A645F },  // seg 67
-    '{ 32'hBF093217, 32'hBE324EF2, 32'hBC6A5D77 },  // seg 68
-    '{ 32'hBF101F44, 32'hBE44DDF5, 32'hBC8E0CF8 },  // seg 69
-    '{ 32'hBF12EEFF, 32'hBE4BD952, 32'hBC96B899 },  // seg 70
-    '{ 32'hBF121E42, 32'hBE4A0685, 32'hBC94AED3 },  // seg 71
-    '{ 32'hBF0E4E2A, 32'hBE41EA78, 32'hBC8C0EC0 },  // seg 72
-    '{ 32'hBF047922, 32'hBE2E876E, 32'hBC71E175 },  // seg 73
-    '{ 32'hBEE5994E, 32'hBE0F1D79, 32'hBC3A059D },  // seg 74
-    '{ 32'hBEBEB455, 32'hBDDFEE7F, 32'hBC081CCC },  // seg 75
-    '{ 32'hBE9948EA, 32'hBDA96AD6, 32'hBBC0C8E9 },  // seg 76
-    '{ 32'hBE701045, 32'hBD7A00B1, 32'hBB8571EB },  // seg 77
-    '{ 32'hBE3805EB, 32'hBD34E735, 32'hBB35ABED },  // seg 78
-    '{ 32'hBE0A9538, 32'hBD00E03E, 32'hBAF42F6A },  // seg 79
-    '{ 32'hBDCD9AF0, 32'hBCB54853, 32'hBAA2755A }  // seg 80
-};
-
-localparam logic [31:0] PWPOLYF_SIGMOID_COEFFS[81][3] = '{
-    '{ 32'h3F000000, 32'h3E7F33B4, 32'hB21FFF88 },  // seg 0
-    '{ 32'h3EFFCF27, 32'h3E822CCD, 32'hBC84C1F2 },  // seg 1
-    '{ 32'h3EFFBC74, 32'h3E82B1D2, 32'hBC938C36 },  // seg 2
-    '{ 32'h3EFFA5B5, 32'h3E834361, 32'hBCA21BEF },  // seg 3
-    '{ 32'h3EFF8A9F, 32'h3E83E0E0, 32'hBCB06BD4 },  // seg 4
-    '{ 32'h3EFF6B53, 32'h3E8487A2, 32'hBCBE4EDE },  // seg 5
-    '{ 32'h3EFF47DE, 32'h3E853610, 32'hBCCBB848 },  // seg 6
-    '{ 32'h3EFF1FD8, 32'h3E85ED00, 32'hBCD8C92D },  // seg 7
-    '{ 32'h3EFEF3E7, 32'h3E86A89A, 32'hBCE54D90 },  // seg 8
-    '{ 32'h3EFEA7FC, 32'h3E87D4B7, 32'hBCF7D81E },  // seg 9
-    '{ 32'h3EFE3434, 32'h3E897082, 32'hBD075E74 },  // seg 10
-    '{ 32'h3EFDB0A7, 32'h3E8B15AC, 32'hBD11E821 },  // seg 11
-    '{ 32'h3EFD1FFF, 32'h3E8CBAB3, 32'hBD1B7B94 },  // seg 12
-    '{ 32'h3EFC85A0, 32'h3E8E568F, 32'hBD2411C8 },  // seg 13
-    '{ 32'h3EFBE681, 32'h3E8FDE72, 32'hBD2B9C7D },  // seg 14
-    '{ 32'h3EFB46C8, 32'h3E914BCF, 32'hBD322448 },  // seg 15
-    '{ 32'h3EFAAB5A, 32'h3E9297AA, 32'hBD37AD8F },  // seg 16
-    '{ 32'h3EF9DBB3, 32'h3E9432BF, 32'hBD3E0993 },  // seg 17
-    '{ 32'h3EF909EB, 32'h3E95A9C9, 32'hBD43470B },  // seg 18
-    '{ 32'h3EF8B573, 32'h3E9632E0, 32'hBD450418 },  // seg 19
-    '{ 32'h3EF909F7, 32'h3E95B9B4, 32'hBD43A89C },  // seg 20
-    '{ 32'h3EFA2B65, 32'h3E943959, 32'hBD3FAB94 },  // seg 21
-    '{ 32'h3EFC33ED, 32'h3E91B9FE, 32'hBD3988A9 },  // seg 22
-    '{ 32'h3EFF34ED, 32'h3E8E4C2C, 32'hBD31B440 },  // seg 23
-    '{ 32'h3F01997B, 32'h3E8A0ACB, 32'hBD28A198 },  // seg 24
-    '{ 32'h3F057A8B, 32'h3E8262C5, 32'hBD198434 },  // seg 25
-    '{ 32'h3F0C3064, 32'h3E6CECF8, 32'hBD0452A9 },  // seg 26
-    '{ 32'h3F1447E1, 32'h3E530782, 32'hBCDF318F },  // seg 27
-    '{ 32'h3F1D4A68, 32'h3E38CE80, 32'hBCB905FB },  // seg 28
-    '{ 32'h3F26C195, 32'h3E1F8C72, 32'hBC9750E8 },  // seg 29
-    '{ 32'h3F3044A9, 32'h3E081DC6, 32'hBC74E58A },  // seg 30
-    '{ 32'h3F3982E1, 32'h3DE5F174, 32'hBC448461 },  // seg 31
-    '{ 32'h3F424019, 32'h3DC09FBD, 32'hBC1CAB6F },  // seg 32
-    '{ 32'h3F4DF478, 32'h3D924A5B, 32'hBBDD9BF8 },  // seg 33
-    '{ 32'h3F5B2AC7, 32'h3D464E25, 32'hBB897CAF },  // seg 34
-    '{ 32'h3F656FAC, 32'h3D045E36, 32'hBB291861 },  // seg 35
-    '{ 32'h3F6D25C5, 32'h3CAEBA7A, 32'hBACED519 },  // seg 36
-    '{ 32'h3F72CA96, 32'h3C64B5A7, 32'hBA7C2B6D },  // seg 37
-    '{ 32'h3F76D7D9, 32'h3C14B4A4, 32'hBA196B60 },  // seg 38
-    '{ 32'h3F79B538, 32'h3BC060EC, 32'hB9BA7A5A },  // seg 39
-    '{ 32'h3F7BB5E3, 32'h3B77B8AD, 32'hB9626A80 },  // seg 40
-    '{ 32'h3F001880, 32'h3E822DEE, 32'h3C84E3F8 },  // seg 41
-    '{ 32'h3F0021CE, 32'h3E82B23F, 32'h3C939805 },  // seg 42
-    '{ 32'h3F002D26, 32'h3E834368, 32'h3CA21C3C },  // seg 43
-    '{ 32'h3F003A99, 32'h3E83DFE3, 32'h3CB05650 },  // seg 44
-    '{ 32'h3F004A09, 32'h3E84848C, 32'h3CBE0FC2 },  // seg 45
-    '{ 32'h3F005BFD, 32'h3E853553, 32'h3CCBAA1A },  // seg 46
-    '{ 32'h3F007027, 32'h3E85EDAA, 32'h3CD8D518 },  // seg 47
-    '{ 32'h3F008689, 32'h3E86AC9A, 32'h3CE58F3C },  // seg 48
-    '{ 32'h3F00AC00, 32'h3E87D4A7, 32'h3CF7D732 },  // seg 49
-    '{ 32'h3F00E601, 32'h3E89713C, 32'h3D076365 },  // seg 50
-    '{ 32'h3F0127C8, 32'h3E8B1652, 32'h3D11EC06 },  // seg 51
-    '{ 32'h3F016FFD, 32'h3E8CBAA0, 32'h3D1B7B31 },  // seg 52
-    '{ 32'h3F01BD0C, 32'h3E8E55D8, 32'h3D240E23 },  // seg 53
-    '{ 32'h3F020CD3, 32'h3E8FDED1, 32'h3D2B9E4A },  // seg 54
-    '{ 32'h3F025CCD, 32'h3E914CAB, 32'h3D322819 },  // seg 55
-    '{ 32'h3F02AA33, 32'h3E929729, 32'h3D37AB80 },  // seg 56
-    '{ 32'h3F031236, 32'h3E9432FB, 32'h3D3E0A78 },  // seg 57
-    '{ 32'h3F037AF9, 32'h3E95A98C, 32'h3D43463B },  // seg 58
-    '{ 32'h3F03A527, 32'h3E96327F, 32'h3D4502ED },  // seg 59
-    '{ 32'h3F037B01, 32'h3E95B9AA, 32'h3D43A880 },  // seg 60
-    '{ 32'h3F02EA67, 32'h3E94399C, 32'h3D3FAC3D },  // seg 61
-    '{ 32'h3F01E5FE, 32'h3E91B9E3, 32'h3D39886A },  // seg 62
-    '{ 32'h3F00659B, 32'h3E8E4C54, 32'h3D31B499 },  // seg 63
-    '{ 32'h3EFCCCD9, 32'h3E8A0A9A, 32'h3D28A135 },  // seg 64
-    '{ 32'h3EF50AC9, 32'h3E8262A6, 32'h3D1983F9 },  // seg 65
-    '{ 32'h3EE79F36, 32'h3E6CECF3, 32'h3D0452A4 },  // seg 66
-    '{ 32'h3ED77017, 32'h3E530747, 32'h3CDF3133 },  // seg 67
-    '{ 32'h3EC56B50, 32'h3E38CEAF, 32'h3CB9063C },  // seg 68
-    '{ 32'h3EB27DCB, 32'h3E1F8DAD, 32'h3C97527C },  // seg 69
-    '{ 32'h3E9F76E9, 32'h3E081E0B, 32'h3C74E62F },  // seg 70
-    '{ 32'h3E8CFAEA, 32'h3DE5F2F3, 32'h3C448608 },  // seg 71
-    '{ 32'h3E770090, 32'h3DC0A0B8, 32'h3C1CAC71 },  // seg 72
-    '{ 32'h3E482E7E, 32'h3D924AB4, 32'h3BDD9CA1 },  // seg 73
-    '{ 32'h3E13552B, 32'h3D464E9B, 32'h3B897D12 },  // seg 74
-    '{ 32'h3DD481AF, 32'h3D045D7B, 32'h3B29173F },  // seg 75
-    '{ 32'h3D96D28F, 32'h3CAEBB74, 32'h3ACED672 },  // seg 76
-    '{ 32'h3D5358A4, 32'h3C64B82E, 32'h3A7C2E99 },  // seg 77
-    '{ 32'h3D1283F5, 32'h3C14B667, 32'h3A196D6B },  // seg 78
-    '{ 32'h3CC95645, 32'h3BC05DE7, 32'h39BA76F4 },  // seg 79
-    '{ 32'h3C8947BE, 32'h3B77C111, 32'h39627305 }  // seg 80
-};
-
-localparam logic [31:0] PWPOLYF_TANH_COEFFS[81][3] = '{
-    '{ 32'h24C775B8, 32'h3F7CD991, 32'hA73006D1 },  // seg 0
-    '{ 32'hBBAC00F6, 32'h3F87D4AF, 32'hBE77D79E },  // seg 1
-    '{ 32'hBBE5F686, 32'h3F8970F2, 32'hBE87616C },  // seg 2
-    '{ 32'hBC13E04D, 32'h3F8B1626, 32'hBE91EAFF },  // seg 3
-    '{ 32'hBC38062F, 32'h3F8CBAF6, 32'hBE9B7D12 },  // seg 4
-    '{ 32'hBC5E87C9, 32'h3F8E55E8, 32'hBEA40E6C },  // seg 5
-    '{ 32'hBC833183, 32'h3F8FDE94, 32'hBEAB9D26 },  // seg 6
-    '{ 32'hBC97304B, 32'h3F914C74, 32'hBEB22725 },  // seg 7
-    '{ 32'hBCAA905C, 32'h3F929762, 32'hBEB7AC6C },  // seg 8
-    '{ 32'hBCC48DB5, 32'h3F9432FE, 32'hBEBE0A82 },  // seg 9
-    '{ 32'hBCDEBF5B, 32'h3F95A99C, 32'hBEC34673 },  // seg 10
-    '{ 32'hBCE94C8C, 32'h3F9632A2, 32'hBEC50358 },  // seg 11
-    '{ 32'hBCDEBE2C, 32'h3F95B992, 32'hBEC3A83E },  // seg 12
-    '{ 32'hBCBA9ED9, 32'h3F9439CF, 32'hBEBFACBF },  // seg 13
-    '{ 32'hBC72F4C3, 32'h3F91B9B3, 32'hBEB987F9 },  // seg 14
-    '{ 32'hBB4B4F04, 32'h3F8E4C70, 32'hBEB1B4D9 },  // seg 15
-    '{ 32'h3C4CBE96, 32'h3F8A0AC9, 32'hBEA8A196 },  // seg 16
-    '{ 32'h3D2F5351, 32'h3F8262A8, 32'hBE9983FC },  // seg 17
-    '{ 32'h3DC30600, 32'h3F6CED04, 32'hBE8452B2 },  // seg 18
-    '{ 32'h3E223FEB, 32'h3F53072C, 32'hBE5F310C },  // seg 19
-    '{ 32'h3E6A52FA, 32'h3F38CE9A, 32'hBE39061F },  // seg 20
-    '{ 32'h3E9B0431, 32'h3F1F8DD2, 32'hBE1752AC },  // seg 21
-    '{ 32'h3EC1122B, 32'h3F081E0D, 32'hBDF4E633 },  // seg 22
-    '{ 32'h3EE60A55, 32'h3EE5F2C5, 32'hBDC485D6 },  // seg 23
-    '{ 32'h3F047F8C, 32'h3EC0A114, 32'hBD9CACD1 },  // seg 24
-    '{ 32'h3F1BE8C6, 32'h3E924AAB, 32'hBD5D9C90 },  // seg 25
-    '{ 32'h3F365584, 32'h3E464E44, 32'hBD097CC9 },  // seg 26
-    '{ 32'h3F4ADF8C, 32'h3E045D95, 32'hBCA91767 },  // seg 27
-    '{ 32'h3F5A4B6D, 32'h3DAEBB15, 32'hBC4ED5EF },  // seg 28
-    '{ 32'h3F6594EE, 32'h3D64B815, 32'hBBFC2E78 },  // seg 29
-    '{ 32'h3F6DAF91, 32'h3D14B5D0, 32'hBB996CB5 },  // seg 30
-    '{ 32'h3F736A9F, 32'h3CC05DAA, 32'hBB3A76B3 },  // seg 31
-    '{ 32'h3F776B54, 32'h3C77C747, 32'hBAE2796C },  // seg 32
-    '{ 32'h3F7B290A, 32'h3C00F553, 32'hBA590596 },  // seg 33
-    '{ 32'h3F7DD423, 32'h3B51CDAD, 32'hB99FBCDD },  // seg 34
-    '{ 32'h3F7F0B0E, 32'h3AA91301, 32'hB8EB151C },  // seg 35
-    '{ 32'h3F7F95A3, 32'h3A073EF5, 32'hB82D041E },  // seg 36
-    '{ 32'h3F7FD274, 32'h3956AD85, 32'hB77E46D5 },  // seg 37
-    '{ 32'h3F7FECAF, 32'h38A9A5DF, 32'hB6BB164C },  // seg 38
-    '{ 32'h3F7FF7E0, 32'h3805A163, 32'hB609E0DD },  // seg 39
-    '{ 32'h3F7FFCA1, 32'h37505C2F, 32'hB549DC3A },  // seg 40
-    '{ 32'h3BAC00F6, 32'h3F87D4AF, 32'h3E77D79E },  // seg 41
-    '{ 32'h3BE5F686, 32'h3F8970F2, 32'h3E87616C },  // seg 42
-    '{ 32'h3C13E04D, 32'h3F8B1626, 32'h3E91EAFF },  // seg 43
-    '{ 32'h3C38062F, 32'h3F8CBAF6, 32'h3E9B7D12 },  // seg 44
-    '{ 32'h3C5E87C9, 32'h3F8E55E8, 32'h3EA40E6C },  // seg 45
-    '{ 32'h3C833183, 32'h3F8FDE94, 32'h3EAB9D26 },  // seg 46
-    '{ 32'h3C97304B, 32'h3F914C74, 32'h3EB22725 },  // seg 47
-    '{ 32'h3CAA905C, 32'h3F929762, 32'h3EB7AC6C },  // seg 48
-    '{ 32'h3CC48DB5, 32'h3F9432FE, 32'h3EBE0A82 },  // seg 49
-    '{ 32'h3CDEBF5B, 32'h3F95A99C, 32'h3EC34673 },  // seg 50
-    '{ 32'h3CE94C8C, 32'h3F9632A2, 32'h3EC50358 },  // seg 51
-    '{ 32'h3CDEBE2C, 32'h3F95B992, 32'h3EC3A83E },  // seg 52
-    '{ 32'h3CBA9ED9, 32'h3F9439CF, 32'h3EBFACBF },  // seg 53
-    '{ 32'h3C72F4C3, 32'h3F91B9B3, 32'h3EB987F9 },  // seg 54
-    '{ 32'h3B4B4F04, 32'h3F8E4C70, 32'h3EB1B4D9 },  // seg 55
-    '{ 32'hBC4CBE96, 32'h3F8A0AC9, 32'h3EA8A196 },  // seg 56
-    '{ 32'hBD2F5351, 32'h3F8262A8, 32'h3E9983FC },  // seg 57
-    '{ 32'hBDC30600, 32'h3F6CED04, 32'h3E8452B2 },  // seg 58
-    '{ 32'hBE223FEB, 32'h3F53072C, 32'h3E5F310C },  // seg 59
-    '{ 32'hBE6A52FA, 32'h3F38CE9A, 32'h3E39061F },  // seg 60
-    '{ 32'hBE9B0431, 32'h3F1F8DD2, 32'h3E1752AC },  // seg 61
-    '{ 32'hBEC1122B, 32'h3F081E0D, 32'h3DF4E633 },  // seg 62
-    '{ 32'hBEE60A55, 32'h3EE5F2C5, 32'h3DC485D6 },  // seg 63
-    '{ 32'hBF047F8C, 32'h3EC0A114, 32'h3D9CACD1 },  // seg 64
-    '{ 32'hBF1BE8C6, 32'h3E924AAB, 32'h3D5D9C90 },  // seg 65
-    '{ 32'hBF365584, 32'h3E464E44, 32'h3D097CC9 },  // seg 66
-    '{ 32'hBF4ADF8C, 32'h3E045D95, 32'h3CA91767 },  // seg 67
-    '{ 32'hBF5A4B6D, 32'h3DAEBB15, 32'h3C4ED5EF },  // seg 68
-    '{ 32'hBF6594EE, 32'h3D64B815, 32'h3BFC2E78 },  // seg 69
-    '{ 32'hBF6DAF91, 32'h3D14B5D0, 32'h3B996CB5 },  // seg 70
-    '{ 32'hBF736A9F, 32'h3CC05DAA, 32'h3B3A76B3 },  // seg 71
-    '{ 32'hBF776B54, 32'h3C77C747, 32'h3AE2796C },  // seg 72
-    '{ 32'hBF7B290A, 32'h3C00F553, 32'h3A590596 },  // seg 73
-    '{ 32'hBF7DD423, 32'h3B51CDAD, 32'h399FBCDD },  // seg 74
-    '{ 32'hBF7F0B0E, 32'h3AA91301, 32'h38EB151C },  // seg 75
-    '{ 32'hBF7F95A3, 32'h3A073EF5, 32'h382D041E },  // seg 76
-    '{ 32'hBF7FD274, 32'h3956AD85, 32'h377E46D5 },  // seg 77
-    '{ 32'hBF7FECAF, 32'h38A9A5DF, 32'h36BB164C },  // seg 78
-    '{ 32'hBF7FF7E0, 32'h3805A163, 32'h3609E0DD },  // seg 79
-    '{ 32'hBF7FFCA1, 32'h37505C2F, 32'h3549DC3A }  // seg 80
-};
-
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
new file mode 100644
index 0000000000..2838f03fe2
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
@@ -0,0 +1,395 @@
+/****************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Per-activation configuration for pwpolyf.
+ * @author	Shane Fleming <shane.fleming@amd.com>
+ *
+ * @description
+ *	Package consolidating shared approximation constants and per-activation
+ *	configuration (clamping parameters + coefficient arrays) for the
+ *	piecewise polynomial activation unit.
+ *
+ *	Coefficient data auto-generated by pwpolyf_coeffs.py -- DEGREE=2 K=3
+ *	NUM_OCTAVES=5.  Segments: 81  Coefficients per segment: 3
+ *	Polynomial: y = a_0 + a_1*x + a_2*x^2 + ... + a_d*x^d
+ *	Horner form: y = a_0 + x*(a_1 + x*(a_2 + ... x*a_d))
+ *
+ *	Segment index encoding (exp = biased FP32 exponent of the input):
+ *	  0                               = near-zero (|x| < 0.25)
+ *	  1 .. NUM_OCTAVES*2^K            = positive octaves (exp 125..129), i.e. [1..40]
+ *	  NUM_OCTAVES*2^K+1 .. NUM_SEGS-1 = negative octaves (exp 125..129), i.e. [41..80]
+ ***************************************************************************/
+package pwpolyf_pkg;
+
+	localparam int unsigned  DEGREE      = 2;	// polynomial degree; DEGREE+1 coefficients per segment
+	localparam int unsigned  K           = 3;	// each octave is split into 2^K sub-segments
+	localparam int unsigned  NUM_OCTAVES = 5;	// octaves tabulated per sign (0.25 <= |x| < 8)
+	localparam int unsigned  NUM_SEGS    = 81;	// 1 near-zero + 2 signs * NUM_OCTAVES * 2^K
+
+	typedef struct {	// per-activation configuration: clamping parameters + coefficient table
+		logic [31:0]  neg_clamp;	// FP32 bit pattern; presumably the output for inputs below the tabulated range -- confirm against pwpolyf.sv
+		logic [31:0]  pos_clamp;	// FP32 bit pattern; presumably the output for inputs above the tabulated range when pos_passthrough is 0 -- confirm
+		bit           pos_passthrough;	// set for GELU/SILU; presumably forwards x unchanged above the range instead of pos_clamp -- confirm
+		logic [31:0]  coeffs[NUM_SEGS][DEGREE+1];	// coeffs[seg][i] = a_i as FP32 bit pattern, lowest order (a_0) first
+	} func_cfg_t;
+
+	localparam func_cfg_t  GELU = '{
+		neg_clamp:       32'h00000000,
+		pos_clamp:       32'h00000000,
+		pos_passthrough: 1,
+		coeffs: '{
+			'{ 32'h37B92D98, 32'h3F000000, 32'h3ECA7276 },	// [0] near_zero: [-0.2500, 0.2500)
+			'{ 32'hBA7AD2E9, 32'h3F0278B4, 32'h3EBE374D },	// [1] pos_oct0_sub0: [0.2500, 0.2812)
+			'{ 32'hBAC20AB3, 32'h3F036C6B, 32'h3EBAD579 },	// [2] pos_oct0_sub1: [0.2812, 0.3125)
+			'{ 32'hBB0F6AA9, 32'h3F04950F, 32'h3EB720D6 },	// [3] pos_oct0_sub2: [0.3125, 0.3438)
+			'{ 32'hBB4C32E1, 32'h3F05F66C, 32'h3EB31D68 },	// [4] pos_oct0_sub3: [0.3438, 0.3750)
+			'{ 32'hBB8CE121, 32'h3F0793CF, 32'h3EAECF81 },	// [5] pos_oct0_sub4: [0.3750, 0.4062)
+			'{ 32'hBBBD44F5, 32'h3F097002, 32'h3EAA3BB6 },	// [6] pos_oct0_sub5: [0.4062, 0.4375)
+			'{ 32'hBBF87F2B, 32'h3F0B8D46, 32'h3EA566DF },	// [7] pos_oct0_sub6: [0.4375, 0.4688)
+			'{ 32'hBC1FE374, 32'h3F0DED4E, 32'h3EA05607 },	// [8] pos_oct0_sub7: [0.4688, 0.5000)
+			'{ 32'hBC6196C5, 32'h3F11FAE7, 32'h3E985541 },	// [9] pos_oct1_sub0: [0.5000, 0.5625)
+			'{ 32'hBCA9FA9B, 32'h3F185408, 32'h3E8D0D86 },	// [10] pos_oct1_sub1: [0.5625, 0.6250)
+			'{ 32'hBCF402A2, 32'h3F1FBA0A, 32'h3E813805 },	// [11] pos_oct1_sub2: [0.6250, 0.6875)
+			'{ 32'hBD28346B, 32'h3F281F7A, 32'h3E6A0426 },	// [12] pos_oct1_sub3: [0.6875, 0.7500)
+			'{ 32'hBD6013B5, 32'h3F316EAB, 32'h3E513158 },	// [13] pos_oct1_sub4: [0.7500, 0.8125)
+			'{ 32'hBD90E5E4, 32'h3F3B8A8B, 32'h3E384F09 },	// [14] pos_oct1_sub5: [0.8125, 0.8750)
+			'{ 32'hBDB698C5, 32'h3F464FB9, 32'h3E1FB052 },	// [15] pos_oct1_sub6: [0.8750, 0.9375)
+			'{ 32'hBDE0DF89, 32'h3F5195CD, 32'h3E07A270 },	// [16] pos_oct1_sub7: [0.9375, 1.0000)
+			'{ 32'hBE13A45A, 32'h3F62F993, 32'h3DCA8921 },	// [17] pos_oct2_sub0: [1.0000, 1.1250)
+			'{ 32'hBE483CB7, 32'h3F7A5DFA, 32'h3D6E8A6E },	// [18] pos_oct2_sub1: [1.1250, 1.2500)
+			'{ 32'hBE7FAE79, 32'h3F8848C8, 32'h3CC080D0 },	// [19] pos_oct2_sub2: [1.2500, 1.3750)
+			'{ 32'hBE9AF2A1, 32'h3F9227C4, 32'hBB9670CC },	// [20] pos_oct2_sub3: [1.3750, 1.5000)
+			'{ 32'hBEB35AF6, 32'h3F9A4E45, 32'hBCD3D62F },	// [21] pos_oct2_sub4: [1.5000, 1.6250)
+			'{ 32'hBEC733B2, 32'h3FA06D28, 32'hBD26553D },	// [22] pos_oct2_sub5: [1.6250, 1.7500)
+			'{ 32'hBED511F1, 32'h3FA466BA, 32'hBD4ACBAB },	// [23] pos_oct2_sub6: [1.7500, 1.8750)
+			'{ 32'hBEDC2153, 32'h3FA64B63, 32'hBD5B0B08 },	// [24] pos_oct2_sub7: [1.8750, 2.0000)
+			'{ 32'hBED97943, 32'h3FA5A9DF, 32'hBD5641AE },	// [25] pos_oct3_sub0: [2.0000, 2.2500)
+			'{ 32'hBEC1A84E, 32'h3FA066D7, 32'hBD310ADA },	// [26] pos_oct3_sub1: [2.2500, 2.5000)
+			'{ 32'hBE9AF232, 32'h3F98AA14, 32'hBCFF13CB },	// [27] pos_oct3_sub2: [2.5000, 2.7500)
+			'{ 32'hBE609A95, 32'h3F90E6E9, 32'hBCA49F87 },	// [28] pos_oct3_sub3: [2.7500, 3.0000)
+			'{ 32'hBE145DCE, 32'h3F8A8925, 32'hBC411EC8 },	// [29] pos_oct3_sub4: [3.0000, 3.2500)
+			'{ 32'hBDB37CB4, 32'h3F8603AD, 32'hBBCFAA27 },	// [30] pos_oct3_sub5: [3.2500, 3.5000)
+			'{ 32'hBD47900C, 32'h3F8328BC, 32'hBB4DD2CD },	// [31] pos_oct3_sub6: [3.5000, 3.7500)
+			'{ 32'hBCCC8719, 32'h3F818839, 32'hBABCBE73 },	// [32] pos_oct3_sub7: [3.7500, 4.0000)
+			'{ 32'hBC075296, 32'h3F8077C5, 32'hB9D4797E },	// [33] pos_oct4_sub0: [4.0000, 4.5000)
+			'{ 32'hBAB5A3CC, 32'h3F801236, 32'hB869FE3F },	// [34] pos_oct4_sub1: [4.5000, 5.0000)
+			'{ 32'hB935F6EF, 32'h3F800215, 32'hB6C346B7 },	// [35] pos_oct4_sub2: [5.0000, 5.5000)
+			'{ 32'hB78928DE, 32'h3F80002E, 32'hB4F89CA8 },	// [36] pos_oct4_sub3: [5.5000, 6.0000)
+			'{ 32'hB59C87B9, 32'h3F800003, 32'hB2F2851B },	// [37] pos_oct4_sub4: [6.0000, 6.5000)
+			'{ 32'hB387DBFC, 32'h3F800000, 32'hB0B5DB02 },	// [38] pos_oct4_sub5: [6.5000, 7.0000)
+			'{ 32'hB133EE82, 32'h3F800000, 32'hAE520F13 },	// [39] pos_oct4_sub6: [7.0000, 7.5000)
+			'{ 32'hAEB983D5, 32'h3F800000, 32'hABBEA652 },	// [40] pos_oct4_sub7: [7.5000, 8.0000)
+			'{ 32'hBA7AD2E9, 32'h3EFB0E98, 32'h3EBE374D },	// [41] neg_oct0_sub0: [-0.2812, -0.2500)
+			'{ 32'hBAC20AB3, 32'h3EF9272A, 32'h3EBAD579 },	// [42] neg_oct0_sub1: [-0.3125, -0.2812)
+			'{ 32'hBB0F6AA9, 32'h3EF6D5E1, 32'h3EB720D6 },	// [43] neg_oct0_sub2: [-0.3438, -0.3125)
+			'{ 32'hBB4C32E1, 32'h3EF41328, 32'h3EB31D68 },	// [44] neg_oct0_sub3: [-0.3750, -0.3438)
+			'{ 32'hBB8CE121, 32'h3EF0D863, 32'h3EAECF81 },	// [45] neg_oct0_sub4: [-0.4062, -0.3750)
+			'{ 32'hBBBD44F5, 32'h3EED1FFC, 32'h3EAA3BB6 },	// [46] neg_oct0_sub5: [-0.4375, -0.4062)
+			'{ 32'hBBF87F2B, 32'h3EE8E573, 32'h3EA566DF },	// [47] neg_oct0_sub6: [-0.4688, -0.4375)
+			'{ 32'hBC1FE374, 32'h3EE42563, 32'h3EA05607 },	// [48] neg_oct0_sub7: [-0.5000, -0.4688)
+			'{ 32'hBC6196C5, 32'h3EDC0A33, 32'h3E985541 },	// [49] neg_oct1_sub0: [-0.5625, -0.5000)
+			'{ 32'hBCA9FA9B, 32'h3ECF57F0, 32'h3E8D0D86 },	// [50] neg_oct1_sub1: [-0.6250, -0.5625)
+			'{ 32'hBCF402A2, 32'h3EC08BED, 32'h3E813805 },	// [51] neg_oct1_sub2: [-0.6875, -0.6250)
+			'{ 32'hBD28346B, 32'h3EAFC10C, 32'h3E6A0426 },	// [52] neg_oct1_sub3: [-0.7500, -0.6875)
+			'{ 32'hBD6013B5, 32'h3E9D22AA, 32'h3E513158 },	// [53] neg_oct1_sub4: [-0.8125, -0.7500)
+			'{ 32'hBD90E5E4, 32'h3E88EAEA, 32'h3E384F09 },	// [54] neg_oct1_sub5: [-0.8750, -0.8125)
+			'{ 32'hBDB698C5, 32'h3E66C11A, 32'h3E1FB052 },	// [55] neg_oct1_sub6: [-0.9375, -0.8750)
+			'{ 32'hBDE0DF89, 32'h3E39A8CD, 32'h3E07A270 },	// [56] neg_oct1_sub7: [-1.0000, -0.9375)
+			'{ 32'hBE13A45A, 32'h3DE83369, 32'h3DCA8921 },	// [57] neg_oct2_sub0: [-1.1250, -1.0000)
+			'{ 32'hBE483CB7, 32'h3CB440CF, 32'h3D6E8A6E },	// [58] neg_oct2_sub1: [-1.2500, -1.1250)
+			'{ 32'hBE7FAE79, 32'hBD848C82, 32'h3CC080D0 },	// [59] neg_oct2_sub2: [-1.3750, -1.2500)
+			'{ 32'hBE9AF2A1, 32'hBE113E1E, 32'hBB9670CC },	// [60] neg_oct2_sub3: [-1.5000, -1.3750)
+			'{ 32'hBEB35AF6, 32'hBE52722A, 32'hBCD3D62F },	// [61] neg_oct2_sub4: [-1.6250, -1.5000)
+			'{ 32'hBEC733B2, 32'hBE81B4A1, 32'hBD26553D },	// [62] neg_oct2_sub5: [-1.7500, -1.6250)
+			'{ 32'hBED511F1, 32'hBE919AE6, 32'hBD4ACBAB },	// [63] neg_oct2_sub6: [-1.8750, -1.7500)
+			'{ 32'hBEDC2153, 32'hBE992D8A, 32'hBD5B0B08 },	// [64] neg_oct2_sub7: [-2.0000, -1.8750)
+			'{ 32'hBED97943, 32'hBE96A77B, 32'hBD5641AE },	// [65] neg_oct3_sub0: [-2.2500, -2.0000)
+			'{ 32'hBEC1A84E, 32'hBE819B5D, 32'hBD310ADA },	// [66] neg_oct3_sub1: [-2.5000, -2.2500)
+			'{ 32'hBE9AF232, 32'hBE4550A3, 32'hBCFF13CB },	// [67] neg_oct3_sub2: [-2.7500, -2.5000)
+			'{ 32'hBE609A95, 32'hBE073747, 32'hBCA49F87 },	// [68] neg_oct3_sub3: [-3.0000, -2.7500)
+			'{ 32'hBE145DCE, 32'hBDA8924A, 32'hBC411EC8 },	// [69] neg_oct3_sub4: [-3.2500, -3.0000)
+			'{ 32'hBDB37CB4, 32'hBD4075A8, 32'hBBCFAA27 },	// [70] neg_oct3_sub5: [-3.5000, -3.2500)
+			'{ 32'hBD47900C, 32'hBCCA2F0E, 32'hBB4DD2CD },	// [71] neg_oct3_sub6: [-3.7500, -3.5000)
+			'{ 32'hBCCC8719, 32'hBC441CB8, 32'hBABCBE73 },	// [72] neg_oct3_sub7: [-4.0000, -3.7500)
+			'{ 32'hBC075296, 32'hBB6F8A8D, 32'hB9D4797E },	// [73] neg_oct4_sub0: [-4.5000, -4.0000)
+			'{ 32'hBAB5A3CC, 32'hBA11AE73, 32'hB869FE3F },	// [74] neg_oct4_sub1: [-5.0000, -4.5000)
+			'{ 32'hB935F6EF, 32'hB8853C9B, 32'hB6C346B7 },	// [75] neg_oct4_sub2: [-5.5000, -5.0000)
+			'{ 32'hB78928DF, 32'hB6B89CD1, 32'hB4F89CA9 },	// [76] neg_oct4_sub3: [-6.0000, -5.5000)
+			'{ 32'hB59C87B7, 32'hB4C2CE38, 32'hB2F28516 },	// [77] neg_oct4_sub4: [-6.5000, -6.0000)
+			'{ 32'hB387DC68, 32'hB29D2B70, 32'hB0B5DB95 },	// [78] neg_oct4_sub5: [-7.0000, -6.5000)
+			'{ 32'hB1340659, 32'hB0427F9C, 32'hAE52296C },	// [79] neg_oct4_sub6: [-7.5000, -7.0000)
+			'{ 32'hAEB6A669, 32'hADB910C4, 32'hABBB86A1 }	// [80] neg_oct4_sub7: [-8.0000, -7.5000)
+		}
+	};
+
+	localparam func_cfg_t  SILU = '{
+		neg_clamp:       32'h00000000,
+		pos_clamp:       32'h00000000,
+		pos_passthrough: 1,
+		coeffs: '{
+			'{ 32'h36E8E4D8, 32'h3F000000, 32'h3E7EDCA9 },	// [0] near_zero: [-0.2500, 0.2500)
+			'{ 32'hB99F0E43, 32'h3F00C85E, 32'h3E771F38 },	// [1] pos_oct0_sub0: [0.2500, 0.2812)
+			'{ 32'hB9F6D43F, 32'h3F01164A, 32'h3E74F597 },	// [2] pos_oct0_sub1: [0.2812, 0.3125)
+			'{ 32'hBA370988, 32'h3F017596, 32'h3E729418 },	// [3] pos_oct0_sub2: [0.3125, 0.3438)
+			'{ 32'hBA82C874, 32'h3F01E7B8, 32'h3E6FFC71 },	// [4] pos_oct0_sub3: [0.3438, 0.3750)
+			'{ 32'hBAB52EA2, 32'h3F026E06, 32'h3E6D3079 },	// [5] pos_oct0_sub4: [0.3750, 0.4062)
+			'{ 32'hBAF47A83, 32'h3F0309BD, 32'h3E6A3227 },	// [6] pos_oct0_sub5: [0.4062, 0.4375)
+			'{ 32'hBB213FE4, 32'h3F03BBFC, 32'h3E670392 },	// [7] pos_oct0_sub6: [0.4375, 0.4688)
+			'{ 32'hBB508F23, 32'h3F0485C2, 32'h3E63A6E9 },	// [8] pos_oct0_sub7: [0.4688, 0.5000)
+			'{ 32'hBB945DC5, 32'h3F05E1DA, 32'h3E5E4861 },	// [9] pos_oct1_sub0: [0.5000, 0.5625)
+			'{ 32'hBBE25EA1, 32'h3F080BF4, 32'h3E569783 },	// [10] pos_oct1_sub1: [0.5625, 0.6250)
+			'{ 32'hBC24C1FC, 32'h3F0A9F8F, 32'h3E4E59C7 },	// [11] pos_oct1_sub2: [0.6250, 0.6875)
+			'{ 32'hBC66B247, 32'h3F0D9E5F, 32'h3E45A3CF },	// [12] pos_oct1_sub3: [0.6875, 0.7500)
+			'{ 32'hBC9C5235, 32'h3F11080C, 32'h3E3C8A91 },	// [13] pos_oct1_sub4: [0.7500, 0.8125)
+			'{ 32'hBCCE03BB, 32'h3F14DA46, 32'h3E332307 },	// [14] pos_oct1_sub5: [0.8125, 0.8750)
+			'{ 32'hBD048260, 32'h3F1910E8, 32'h3E2981D7 },	// [15] pos_oct1_sub6: [0.8750, 0.9375)
+			'{ 32'hBD26E393, 32'h3F1DA629, 32'h3E1FBB0A },	// [16] pos_oct1_sub7: [0.9375, 1.0000)
+			'{ 32'hBD637E84, 32'h3F252120, 32'h3E10F470 },	// [17] pos_oct2_sub0: [1.0000, 1.1250)
+			'{ 32'hBDA33C0B, 32'h3F301FB8, 32'h3DFACFC9 },	// [18] pos_oct2_sub1: [1.1250, 1.2500)
+			'{ 32'hBDDE6AD0, 32'h3F3BF5DC, 32'h3DD4EC26 },	// [19] pos_oct2_sub2: [1.2500, 1.3750)
+			'{ 32'hBE112138, 32'h3F484C26, 32'h3DB1044F },	// [20] pos_oct2_sub3: [1.3750, 1.5000)
+			'{ 32'hBE369C04, 32'h3F54CB60, 32'h3D8FAC7F },	// [21] pos_oct2_sub4: [1.5000, 1.6250)
+			'{ 32'hBE5EB18B, 32'h3F612208, 32'h3D629168 },	// [22] pos_oct2_sub5: [1.6250, 1.7500)
+			'{ 32'hBE842940, 32'h3F6D0854, 32'h3D2C2276 },	// [23] pos_oct2_sub6: [1.7500, 1.8750)
+			'{ 32'hBE9933D8, 32'h3F7842BB, 32'h3CF863D2 },	// [24] pos_oct2_sub7: [1.8750, 2.0000)
+			'{ 32'hBEB7B0C1, 32'h3F83AB39, 32'h3C8110DA },	// [25] pos_oct3_sub0: [2.0000, 2.2500)
+			'{ 32'hBEDD223E, 32'h3F8C0325, 32'h3AA08170 },	// [26] pos_oct3_sub1: [2.2500, 2.5000)
+			'{ 32'hBEFBE17D, 32'h3F922E48, 32'hBC0A6454 },	// [27] pos_oct3_sub2: [2.5000, 2.7500)
+			'{ 32'hBF093230, 32'h3F9649E6, 32'hBC6A5E21 },	// [28] pos_oct3_sub3: [2.7500, 3.0000)
+			'{ 32'hBF101F60, 32'h3F989BC7, 32'hBC8E0D50 },	// [29] pos_oct3_sub4: [3.0000, 3.2500)
+			'{ 32'hBF12EF0A, 32'h3F997B2D, 32'hBC96B8B5 },	// [30] pos_oct3_sub5: [3.2500, 3.5000)
+			'{ 32'hBF121E34, 32'h3F9940CD, 32'hBC94AEB1 },	// [31] pos_oct3_sub6: [3.5000, 3.7500)
+			'{ 32'hBF0E4E33, 32'h3F983D51, 32'hBC8C0ED6 },	// [32] pos_oct3_sub7: [3.7500, 4.0000)
+			'{ 32'hBF04791A, 32'h3F95D0ED, 32'hBC71E16F },	// [33] pos_oct4_sub0: [4.0000, 4.5000)
+			'{ 32'hBEE59935, 32'h3F91E3AD, 32'hBC3A058C },	// [34] pos_oct4_sub1: [4.5000, 5.0000)
+			'{ 32'hBEBEB440, 32'h3F8DFEE7, 32'hBC081CC1 },	// [35] pos_oct4_sub2: [5.0000, 5.5000)
+			'{ 32'hBE9948D3, 32'h3F8A96AC, 32'hBBC0C8D1 },	// [36] pos_oct4_sub3: [5.5000, 6.0000)
+			'{ 32'hBE70101E, 32'h3F87D004, 32'hBB8571D8 },	// [37] pos_oct4_sub4: [6.0000, 6.5000)
+			'{ 32'hBE3805CC, 32'h3F85A739, 32'hBB35ABD2 },	// [38] pos_oct4_sub5: [6.5000, 7.0000)
+			'{ 32'hBE0A951B, 32'h3F840701, 32'hBAF42F39 },	// [39] pos_oct4_sub6: [7.0000, 7.5000)
+			'{ 32'hBDCD9ACB, 32'h3F82D521, 32'hBAA27541 },	// [40] pos_oct4_sub7: [7.5000, 8.0000)
+			'{ 32'hB99F0E43, 32'h3EFE6F44, 32'h3E771F38 },	// [41] neg_oct0_sub0: [-0.2812, -0.2500)
+			'{ 32'hB9F6D43F, 32'h3EFDD36C, 32'h3E74F597 },	// [42] neg_oct0_sub1: [-0.3125, -0.2812)
+			'{ 32'hBA370988, 32'h3EFD14D3, 32'h3E729418 },	// [43] neg_oct0_sub2: [-0.3438, -0.3125)
+			'{ 32'hBA82C874, 32'h3EFC3091, 32'h3E6FFC71 },	// [44] neg_oct0_sub3: [-0.3750, -0.3438)
+			'{ 32'hBAB52EA2, 32'h3EFB23F5, 32'h3E6D3079 },	// [45] neg_oct0_sub4: [-0.4062, -0.3750)
+			'{ 32'hBAF47A83, 32'h3EF9EC85, 32'h3E6A3227 },	// [46] neg_oct0_sub5: [-0.4375, -0.4062)
+			'{ 32'hBB213FE4, 32'h3EF88807, 32'h3E670392 },	// [47] neg_oct0_sub6: [-0.4688, -0.4375)
+			'{ 32'hBB508F23, 32'h3EF6F47D, 32'h3E63A6E9 },	// [48] neg_oct0_sub7: [-0.5000, -0.4688)
+			'{ 32'hBB945DC5, 32'h3EF43C4C, 32'h3E5E4861 },	// [49] neg_oct1_sub0: [-0.5625, -0.5000)
+			'{ 32'hBBE25EA1, 32'h3EEFE817, 32'h3E569783 },	// [50] neg_oct1_sub1: [-0.6250, -0.5625)
+			'{ 32'hBC24C1FC, 32'h3EEAC0E2, 32'h3E4E59C7 },	// [51] neg_oct1_sub2: [-0.6875, -0.6250)
+			'{ 32'hBC66B247, 32'h3EE4C342, 32'h3E45A3CF },	// [52] neg_oct1_sub3: [-0.7500, -0.6875)
+			'{ 32'hBC9C5235, 32'h3EDDEFE8, 32'h3E3C8A91 },	// [53] neg_oct1_sub4: [-0.8125, -0.7500)
+			'{ 32'hBCCE03BB, 32'h3ED64B75, 32'h3E332307 },	// [54] neg_oct1_sub5: [-0.8750, -0.8125)
+			'{ 32'hBD048260, 32'h3ECDDE30, 32'h3E2981D7 },	// [55] neg_oct1_sub6: [-0.9375, -0.8750)
+			'{ 32'hBD26E393, 32'h3EC4B3AE, 32'h3E1FBB0A },	// [56] neg_oct1_sub7: [-1.0000, -0.9375)
+			'{ 32'hBD637E84, 32'h3EB5BDC0, 32'h3E10F470 },	// [57] neg_oct2_sub0: [-1.1250, -1.0000)
+			'{ 32'hBDA33C0B, 32'h3E9FC090, 32'h3DFACFC9 },	// [58] neg_oct2_sub1: [-1.2500, -1.1250)
+			'{ 32'hBDDE6AD0, 32'h3E881448, 32'h3DD4EC26 },	// [59] neg_oct2_sub2: [-1.3750, -1.2500)
+			'{ 32'hBE112138, 32'h3E5ECF69, 32'h3DB1044F },	// [60] neg_oct2_sub3: [-1.5000, -1.3750)
+			'{ 32'hBE369C04, 32'h3E2CD280, 32'h3D8FAC7F },	// [61] neg_oct2_sub4: [-1.6250, -1.5000)
+			'{ 32'hBE5EB18B, 32'h3DF6EFBF, 32'h3D629168 },	// [62] neg_oct2_sub5: [-1.7500, -1.6250)
+			'{ 32'hBE842940, 32'h3D97BD5C, 32'h3D2C2276 },	// [63] neg_oct2_sub6: [-1.8750, -1.7500)
+			'{ 32'hBE9933D8, 32'h3CF7A895, 32'h3CF863D2 },	// [64] neg_oct2_sub7: [-2.0000, -1.8750)
+			'{ 32'hBEB7B0C1, 32'hBCEACE5E, 32'h3C8110DA },	// [65] neg_oct3_sub0: [-2.2500, -2.0000)
+			'{ 32'hBEDD223E, 32'hBDC03253, 32'h3AA08170 },	// [66] neg_oct3_sub1: [-2.5000, -2.2500)
+			'{ 32'hBEFBE17D, 32'hBE11723E, 32'hBC0A6454 },	// [67] neg_oct3_sub2: [-2.7500, -2.5000)
+			'{ 32'hBF093230, 32'hBE324F32, 32'hBC6A5E21 },	// [68] neg_oct3_sub3: [-3.0000, -2.7500)
+			'{ 32'hBF101F60, 32'hBE44DE3B, 32'hBC8E0D50 },	// [69] neg_oct3_sub4: [-3.2500, -3.0000)
+			'{ 32'hBF12EF0A, 32'hBE4BD96A, 32'hBC96B8B5 },	// [70] neg_oct3_sub5: [-3.5000, -3.2500)
+			'{ 32'hBF121E34, 32'hBE4A0667, 32'hBC94AEB1 },	// [71] neg_oct3_sub6: [-3.7500, -3.5000)
+			'{ 32'hBF0E4E33, 32'hBE41EA8C, 32'hBC8C0ED6 },	// [72] neg_oct3_sub7: [-4.0000, -3.7500)
+			'{ 32'hBF04791A, 32'hBE2E8765, 32'hBC71E16F },	// [73] neg_oct4_sub0: [-4.5000, -4.0000)
+			'{ 32'hBEE59935, 32'hBE0F1D69, 32'hBC3A058C },	// [74] neg_oct4_sub1: [-5.0000, -4.5000)
+			'{ 32'hBEBEB440, 32'hBDDFEE68, 32'hBC081CC1 },	// [75] neg_oct4_sub2: [-5.5000, -5.0000)
+			'{ 32'hBE9948D3, 32'hBDA96ABE, 32'hBBC0C8D1 },	// [76] neg_oct4_sub3: [-6.0000, -5.5000)
+			'{ 32'hBE70101E, 32'hBD7A008A, 32'hBB8571D8 },	// [77] neg_oct4_sub4: [-6.5000, -6.0000)
+			'{ 32'hBE3805CC, 32'hBD34E717, 32'hBB35ABD2 },	// [78] neg_oct4_sub5: [-7.0000, -6.5000)
+			'{ 32'hBE0A951B, 32'hBD00E023, 32'hBAF42F39 },	// [79] neg_oct4_sub6: [-7.5000, -7.0000)
+			'{ 32'hBDCD9ACB, 32'hBCB54833, 32'hBAA27541 }	// [80] neg_oct4_sub7: [-8.0000, -7.5000)
+		}
+	};
+
+	localparam func_cfg_t  SIGMOID = '{
+		neg_clamp:       32'h00000000,
+		pos_clamp:       32'h3F800000,
+		pos_passthrough: 0,
+		coeffs: '{
+			'{ 32'h3F000000, 32'h3E7F33E9, 32'h00000000 },	// [0] near_zero: [-0.2500, 0.2500)
+			'{ 32'h3EFFCF0E, 32'h3E822D89, 32'hBC84D823 },	// [1] pos_oct0_sub0: [0.2500, 0.2812)
+			'{ 32'h3EFFBC5D, 32'h3E82B26C, 32'hBC939CA5 },	// [2] pos_oct0_sub1: [0.2812, 0.3125)
+			'{ 32'h3EFFA5B9, 32'h3E834349, 32'hBCA219C6 },	// [3] pos_oct0_sub2: [0.3125, 0.3438)
+			'{ 32'h3EFF8AE8, 32'h3E83DF4C, 32'hBCB04918 },	// [4] pos_oct0_sub3: [0.3438, 0.3750)
+			'{ 32'h3EFF6BBB, 32'h3E84858E, 32'hBCBE247C },	// [5] pos_oct0_sub4: [0.3750, 0.4062)
+			'{ 32'h3EFF4812, 32'h3E85351C, 32'hBCCBA623 },	// [6] pos_oct0_sub5: [0.4062, 0.4375)
+			'{ 32'h3EFF1FDA, 32'h3E85ECF7, 32'hBCD8C899 },	// [7] pos_oct0_sub6: [0.4375, 0.4688)
+			'{ 32'h3EFEF30F, 32'h3E86AC14, 32'hBCE586C5 },	// [8] pos_oct0_sub7: [0.4688, 0.5000)
+			'{ 32'h3EFEA7FD, 32'h3E87D4B2, 32'hBCF7D7C9 },	// [9] pos_oct1_sub0: [0.5000, 0.5625)
+			'{ 32'h3EFE340F, 32'h3E897100, 32'hBD0761C6 },	// [10] pos_oct1_sub1: [0.5625, 0.6250)
+			'{ 32'h3EFDB07F, 32'h3E8B1626, 32'hBD11EAFB },	// [11] pos_oct1_sub2: [0.6250, 0.6875)
+			'{ 32'h3EFD1FEF, 32'h3E8CBAE0, 32'hBD1B7C9A },	// [12] pos_oct1_sub3: [0.6875, 0.7500)
+			'{ 32'h3EFC85D6, 32'h3E8E5602, 32'hBD240EF3 },	// [13] pos_oct1_sub4: [0.7500, 0.8125)
+			'{ 32'h3EFBE66B, 32'h3E8FDEA9, 32'hBD2B9D89 },	// [14] pos_oct1_sub5: [0.8125, 0.8750)
+			'{ 32'h3EFB4684, 32'h3E914C67, 32'hBD3226EB },	// [15] pos_oct1_sub6: [0.8750, 0.9375)
+			'{ 32'h3EFAAB7A, 32'h3E929769, 32'hBD37AC88 },	// [16] pos_oct1_sub7: [0.9375, 1.0000)
+			'{ 32'h3EF9DB97, 32'h3E9432F4, 32'hBD3E0A59 },	// [17] pos_oct2_sub0: [1.0000, 1.1250)
+			'{ 32'h3EF90A04, 32'h3E95A99D, 32'hBD434674 },	// [18] pos_oct2_sub1: [1.1250, 1.2500)
+			'{ 32'h3EF8B5B0, 32'h3E963283, 32'hBD4502F9 },	// [19] pos_oct2_sub2: [1.2500, 1.3750)
+			'{ 32'h3EF90A01, 32'h3E95B9A5, 32'hBD43A873 },	// [20] pos_oct2_sub3: [1.3750, 1.5000)
+			'{ 32'h3EFA2B30, 32'h3E94399D, 32'hBD3FAC42 },	// [21] pos_oct2_sub4: [1.5000, 1.6250)
+			'{ 32'h3EFC3409, 32'h3E91B9DE, 32'hBD39885F },	// [22] pos_oct2_sub5: [1.6250, 1.7500)
+			'{ 32'h3EFF34CB, 32'h3E8E4C54, 32'hBD31B499 },	// [23] pos_oct2_sub6: [1.7500, 1.8750)
+			'{ 32'h3F019987, 32'h3E8A0AB5, 32'hBD28A16E },	// [24] pos_oct2_sub7: [1.8750, 2.0000)
+			'{ 32'h3F057A9E, 32'h3E8262A4, 32'hBD1983F9 },	// [25] pos_oct3_sub0: [2.0000, 2.2500)
+			'{ 32'h3F0C3068, 32'h3E6CECEE, 32'hBD0452A4 },	// [26] pos_oct3_sub1: [2.2500, 2.5000)
+			'{ 32'h3F1447FB, 32'h3E530738, 32'hBCDF3123 },	// [27] pos_oct3_sub2: [2.5000, 2.7500)
+			'{ 32'h3F1D4A57, 32'h3E38CEB4, 32'hBCB90648 },	// [28] pos_oct3_sub3: [2.7500, 3.0000)
+			'{ 32'h3F26C116, 32'h3E1F8DBB, 32'hBC975293 },	// [29] pos_oct3_sub4: [3.0000, 3.2500)
+			'{ 32'h3F30448D, 32'h3E081E0B, 32'hBC74E633 },	// [30] pos_oct3_sub5: [3.2500, 3.5000)
+			'{ 32'h3F398293, 32'h3DE5F2D2, 32'hBC4485E9 },	// [31] pos_oct3_sub6: [3.5000, 3.7500)
+			'{ 32'h3F423FDE, 32'h3DC0A0B4, 32'hBC1CAC71 },	// [32] pos_oct3_sub7: [3.7500, 4.0000)
+			'{ 32'h3F4DF46F, 32'h3D924A89, 32'hBBDD9C65 },	// [33] pos_oct4_sub0: [4.0000, 4.5000)
+			'{ 32'h3F5B2AC0, 32'h3D464E5F, 32'hBB897CEB },	// [34] pos_oct4_sub1: [4.5000, 5.0000)
+			'{ 32'h3F656FD2, 32'h3D045D55, 32'hBB291712 },	// [35] pos_oct4_sub2: [5.0000, 5.5000)
+			'{ 32'h3F6D25B4, 32'h3CAEBB3F, 32'hBACED637 },	// [36] pos_oct4_sub3: [5.5000, 6.0000)
+			'{ 32'h3F72CA7A, 32'h3C64B7EA, 32'hBA7C2E52 },	// [37] pos_oct4_sub4: [6.0000, 6.5000)
+			'{ 32'h3F76D7C3, 32'h3C14B63C, 32'hBA196D41 },	// [38] pos_oct4_sub5: [6.5000, 7.0000)
+			'{ 32'h3F79B550, 32'h3BC05DAC, 32'hB9BA76BE },	// [39] pos_oct4_sub6: [7.0000, 7.5000)
+			'{ 32'h3F7BB5C3, 32'h3B77C0C9, 32'hB96272C7 },	// [40] pos_oct4_sub7: [7.5000, 8.0000)
+			'{ 32'h3F001879, 32'h3E822D89, 32'h3C84D823 },	// [41] neg_oct0_sub0: [-0.2812, -0.2500)
+			'{ 32'h3F0021D2, 32'h3E82B26C, 32'h3C939CA5 },	// [42] neg_oct0_sub1: [-0.3125, -0.2812)
+			'{ 32'h3F002D23, 32'h3E834349, 32'h3CA219C6 },	// [43] neg_oct0_sub2: [-0.3438, -0.3125)
+			'{ 32'h3F003A8C, 32'h3E83DF4C, 32'h3CB04918 },	// [44] neg_oct0_sub3: [-0.3750, -0.3438)
+			'{ 32'h3F004A22, 32'h3E84858E, 32'h3CBE247C },	// [45] neg_oct0_sub4: [-0.4062, -0.3750)
+			'{ 32'h3F005BF7, 32'h3E85351C, 32'h3CCBA623 },	// [46] neg_oct0_sub5: [-0.4375, -0.4062)
+			'{ 32'h3F007013, 32'h3E85ECF7, 32'h3CD8C899 },	// [47] neg_oct0_sub6: [-0.4688, -0.4375)
+			'{ 32'h3F008678, 32'h3E86AC14, 32'h3CE586C5 },	// [48] neg_oct0_sub7: [-0.5000, -0.4688)
+			'{ 32'h3F00AC01, 32'h3E87D4B2, 32'h3CF7D7C9 },	// [49] neg_oct1_sub0: [-0.5625, -0.5000)
+			'{ 32'h3F00E5F9, 32'h3E897100, 32'h3D0761C6 },	// [50] neg_oct1_sub1: [-0.6250, -0.5625)
+			'{ 32'h3F0127C1, 32'h3E8B1626, 32'h3D11EAFB },	// [51] neg_oct1_sub2: [-0.6875, -0.6250)
+			'{ 32'h3F017009, 32'h3E8CBAE0, 32'h3D1B7C9A },	// [52] neg_oct1_sub3: [-0.7500, -0.6875)
+			'{ 32'h3F01BD15, 32'h3E8E5602, 32'h3D240EF3 },	// [53] neg_oct1_sub4: [-0.8125, -0.7500)
+			'{ 32'h3F020CCB, 32'h3E8FDEA9, 32'h3D2B9D89 },	// [54] neg_oct1_sub5: [-0.8750, -0.8125)
+			'{ 32'h3F025CBE, 32'h3E914C67, 32'h3D3226EB },	// [55] neg_oct1_sub6: [-0.9375, -0.8750)
+			'{ 32'h3F02AA43, 32'h3E929769, 32'h3D37AC88 },	// [56] neg_oct1_sub7: [-1.0000, -0.9375)
+			'{ 32'h3F031234, 32'h3E9432F4, 32'h3D3E0A59 },	// [57] neg_oct2_sub0: [-1.1250, -1.0000)
+			'{ 32'h3F037AFE, 32'h3E95A99D, 32'h3D434674 },	// [58] neg_oct2_sub1: [-1.2500, -1.1250)
+			'{ 32'h3F03A528, 32'h3E963283, 32'h3D4502F9 },	// [59] neg_oct2_sub2: [-1.3750, -1.2500)
+			'{ 32'h3F037AFF, 32'h3E95B9A5, 32'h3D43A873 },	// [60] neg_oct2_sub3: [-1.5000, -1.3750)
+			'{ 32'h3F02EA68, 32'h3E94399D, 32'h3D3FAC42 },	// [61] neg_oct2_sub4: [-1.6250, -1.5000)
+			'{ 32'h3F01E5FC, 32'h3E91B9DE, 32'h3D39885F },	// [62] neg_oct2_sub5: [-1.7500, -1.6250)
+			'{ 32'h3F00659B, 32'h3E8E4C54, 32'h3D31B499 },	// [63] neg_oct2_sub6: [-1.8750, -1.7500)
+			'{ 32'h3EFCCCF2, 32'h3E8A0AB5, 32'h3D28A16E },	// [64] neg_oct2_sub7: [-2.0000, -1.8750)
+			'{ 32'h3EF50AC5, 32'h3E8262A4, 32'h3D1983F9 },	// [65] neg_oct3_sub0: [-2.2500, -2.0000)
+			'{ 32'h3EE79F30, 32'h3E6CECEE, 32'h3D0452A4 },	// [66] neg_oct3_sub1: [-2.5000, -2.2500)
+			'{ 32'h3ED7700A, 32'h3E530738, 32'h3CDF3123 },	// [67] neg_oct3_sub2: [-2.7500, -2.5000)
+			'{ 32'h3EC56B51, 32'h3E38CEB4, 32'h3CB90648 },	// [68] neg_oct3_sub3: [-3.0000, -2.7500)
+			'{ 32'h3EB27DD3, 32'h3E1F8DBB, 32'h3C975293 },	// [69] neg_oct3_sub4: [-3.2500, -3.0000)
+			'{ 32'h3E9F76E6, 32'h3E081E0B, 32'h3C74E633 },	// [70] neg_oct3_sub5: [-3.5000, -3.2500)
+			'{ 32'h3E8CFAD9, 32'h3DE5F2D2, 32'h3C4485E9 },	// [71] neg_oct3_sub6: [-3.7500, -3.5000)
+			'{ 32'h3E770088, 32'h3DC0A0B4, 32'h3C1CAC71 },	// [72] neg_oct3_sub7: [-4.0000, -3.7500)
+			'{ 32'h3E482E45, 32'h3D924A89, 32'h3BDD9C65 },	// [73] neg_oct4_sub0: [-4.5000, -4.0000)
+			'{ 32'h3E1354FF, 32'h3D464E5F, 32'h3B897CEB },	// [74] neg_oct4_sub1: [-5.0000, -4.5000)
+			'{ 32'h3DD48171, 32'h3D045D55, 32'h3B291712 },	// [75] neg_oct4_sub2: [-5.5000, -5.0000)
+			'{ 32'h3D96D261, 32'h3CAEBB3F, 32'h3ACED637 },	// [76] neg_oct4_sub3: [-6.0000, -5.5000)
+			'{ 32'h3D535864, 32'h3C64B7EA, 32'h3A7C2E52 },	// [77] neg_oct4_sub4: [-6.5000, -6.0000)
+			'{ 32'h3D1283CA, 32'h3C14B63C, 32'h3A196D41 },	// [78] neg_oct4_sub5: [-7.0000, -6.5000)
+			'{ 32'h3CC95606, 32'h3BC05DAC, 32'h39BA76BE },	// [79] neg_oct4_sub6: [-7.5000, -7.0000)
+			'{ 32'h3C894795, 32'h3B77C0C9, 32'h396272C7 }	// [80] neg_oct4_sub7: [-8.0000, -7.5000)
+		}
+	};
+
+	localparam func_cfg_t  TANH = '{
+		neg_clamp:       32'hBF800000,
+		pos_clamp:       32'h3F800000,
+		pos_passthrough: 0,
+		coeffs: '{
+			'{ 32'hA1B504F3, 32'h3F7CDA60, 32'h00000000 },	// [0] near_zero: [-0.2500, 0.2500)
+			'{ 32'hBBAC0178, 32'h3F87D4B2, 32'hBE77D7C9 },	// [1] pos_oct0_sub0: [0.2500, 0.2812)
+			'{ 32'hBBE5F89C, 32'h3F897100, 32'hBE8761C6 },	// [2] pos_oct0_sub1: [0.2812, 0.3125)
+			'{ 32'hBC13E055, 32'h3F8B1626, 32'hBE91EAFB },	// [3] pos_oct0_sub2: [0.3125, 0.3438)
+			'{ 32'hBC38044C, 32'h3F8CBAE0, 32'hBE9B7C9A },	// [4] pos_oct0_sub3: [0.3438, 0.3750)
+			'{ 32'hBC5E8A68, 32'h3F8E5602, 32'hBEA40EF3 },	// [5] pos_oct0_sub4: [0.3750, 0.4062)
+			'{ 32'hBC8332A5, 32'h3F8FDEA9, 32'hBEAB9D89 },	// [6] pos_oct0_sub5: [0.4062, 0.4375)
+			'{ 32'hBC972F8C, 32'h3F914C67, 32'hBEB226EB },	// [7] pos_oct0_sub6: [0.4375, 0.4688)
+			'{ 32'hBCAA90CC, 32'h3F929769, 32'hBEB7AC88 },	// [8] pos_oct0_sub7: [0.4688, 0.5000)
+			'{ 32'hBCC48D18, 32'h3F9432F4, 32'hBEBE0A59 },	// [9] pos_oct1_sub0: [0.5000, 0.5625)
+			'{ 32'hBCDEBF75, 32'h3F95A99D, 32'hBEC34674 },	// [10] pos_oct1_sub1: [0.5625, 0.6250)
+			'{ 32'hBCE94A05, 32'h3F963283, 32'hBEC502F9 },	// [11] pos_oct1_sub2: [0.6250, 0.6875)
+			'{ 32'hBCDEBFD9, 32'h3F95B9A5, 32'hBEC3A873 },	// [12] pos_oct1_sub3: [0.6875, 0.7500)
+			'{ 32'hBCBA99F8, 32'h3F94399D, 32'hBEBFAC42 },	// [13] pos_oct1_sub4: [0.7500, 0.8125)
+			'{ 32'hBC72FDCF, 32'h3F91B9DE, 32'hBEB9885F },	// [14] pos_oct1_sub5: [0.8125, 0.8750)
+			'{ 32'hBB4B3567, 32'h3F8E4C54, 32'hBEB1B499 },	// [15] pos_oct1_sub6: [0.8750, 0.9375)
+			'{ 32'h3C4CC375, 32'h3F8A0AB5, 32'hBEA8A16E },	// [16] pos_oct1_sub7: [0.9375, 1.0000)
+			'{ 32'h3D2F53B5, 32'h3F8262A4, 32'hBE9983F9 },	// [17] pos_oct2_sub0: [1.0000, 1.1250)
+			'{ 32'h3DC3067D, 32'h3F6CECEE, 32'hBE8452A4 },	// [18] pos_oct2_sub1: [1.1250, 1.2500)
+			'{ 32'h3E223FD6, 32'h3F530738, 32'hBE5F3123 },	// [19] pos_oct2_sub2: [1.2500, 1.3750)
+			'{ 32'h3E6A52BB, 32'h3F38CEB4, 32'hBE390648 },	// [20] pos_oct2_sub3: [1.3750, 1.5000)
+			'{ 32'h3E9B0459, 32'h3F1F8DBB, 32'hBE175293 },	// [21] pos_oct2_sub4: [1.5000, 1.6250)
+			'{ 32'h3EC11234, 32'h3F081E0B, 32'hBDF4E633 },	// [22] pos_oct2_sub5: [1.6250, 1.7500)
+			'{ 32'h3EE60A4E, 32'h3EE5F2D2, 32'hBDC485E9 },	// [23] pos_oct2_sub6: [1.7500, 1.8750)
+			'{ 32'h3F047FBC, 32'h3EC0A0B4, 32'hBD9CAC71 },	// [24] pos_oct2_sub7: [1.8750, 2.0000)
+			'{ 32'h3F1BE8DE, 32'h3E924A89, 32'hBD5D9C65 },	// [25] pos_oct3_sub0: [2.0000, 2.2500)
+			'{ 32'h3F365580, 32'h3E464E5F, 32'hBD097CEB },	// [26] pos_oct3_sub1: [2.2500, 2.5000)
+			'{ 32'h3F4ADFA4, 32'h3E045D55, 32'hBCA91712 },	// [27] pos_oct3_sub2: [2.5000, 2.7500)
+			'{ 32'h3F5A4B68, 32'h3DAEBB3F, 32'hBC4ED637 },	// [28] pos_oct3_sub3: [2.7500, 3.0000)
+			'{ 32'h3F6594F3, 32'h3D64B7EA, 32'hBBFC2E52 },	// [29] pos_oct3_sub4: [3.0000, 3.2500)
+			'{ 32'h3F6DAF87, 32'h3D14B63C, 32'hBB996D41 },	// [30] pos_oct3_sub5: [3.2500, 3.5000)
+			'{ 32'h3F736AA0, 32'h3CC05DAC, 32'hBB3A76BE },	// [31] pos_oct3_sub6: [3.5000, 3.7500)
+			'{ 32'h3F776B87, 32'h3C77C0C9, 32'hBAE272C7 },	// [32] pos_oct3_sub7: [3.7500, 4.0000)
+			'{ 32'h3F7B291E, 32'h3C00F319, 32'hBA590184 },	// [33] pos_oct4_sub0: [4.0000, 4.5000)
+			'{ 32'h3F7DD41A, 32'h3B51D16D, 32'hB99FC02C },	// [34] pos_oct4_sub1: [4.5000, 5.0000)
+			'{ 32'h3F7F0B06, 32'h3AA9199D, 32'hB8EB1F74 },	// [35] pos_oct4_sub2: [5.0000, 5.5000)
+			'{ 32'h3F7F95A4, 32'h3A073D0F, 32'hB82D01A6 },	// [36] pos_oct4_sub3: [5.5000, 6.0000)
+			'{ 32'h3F7FD268, 32'h3956EC40, 32'hB77E96C4 },	// [37] pos_oct4_sub4: [6.0000, 6.5000)
+			'{ 32'h3F7FECAA, 32'h38A9D71D, 32'hB6BB5164 },	// [38] pos_oct4_sub5: [6.5000, 7.0000)
+			'{ 32'h3F7FF7E0, 32'h38059366, 32'hB609D243 },	// [39] pos_oct4_sub6: [7.0000, 7.5000)
+			'{ 32'h3F7FFC9E, 32'h37513C1C, 32'hB54ACE8B },	// [40] pos_oct4_sub7: [7.5000, 8.0000)
+			'{ 32'h3BAC0178, 32'h3F87D4B2, 32'h3E77D7C9 },	// [41] neg_oct0_sub0: [-0.2812, -0.2500)
+			'{ 32'h3BE5F89C, 32'h3F897100, 32'h3E8761C6 },	// [42] neg_oct0_sub1: [-0.3125, -0.2812)
+			'{ 32'h3C13E055, 32'h3F8B1626, 32'h3E91EAFB },	// [43] neg_oct0_sub2: [-0.3438, -0.3125)
+			'{ 32'h3C38044C, 32'h3F8CBAE0, 32'h3E9B7C9A },	// [44] neg_oct0_sub3: [-0.3750, -0.3438)
+			'{ 32'h3C5E8A68, 32'h3F8E5602, 32'h3EA40EF3 },	// [45] neg_oct0_sub4: [-0.4062, -0.3750)
+			'{ 32'h3C8332A5, 32'h3F8FDEA9, 32'h3EAB9D89 },	// [46] neg_oct0_sub5: [-0.4375, -0.4062)
+			'{ 32'h3C972F8C, 32'h3F914C67, 32'h3EB226EB },	// [47] neg_oct0_sub6: [-0.4688, -0.4375)
+			'{ 32'h3CAA90CC, 32'h3F929769, 32'h3EB7AC88 },	// [48] neg_oct0_sub7: [-0.5000, -0.4688)
+			'{ 32'h3CC48D18, 32'h3F9432F4, 32'h3EBE0A59 },	// [49] neg_oct1_sub0: [-0.5625, -0.5000)
+			'{ 32'h3CDEBF75, 32'h3F95A99D, 32'h3EC34674 },	// [50] neg_oct1_sub1: [-0.6250, -0.5625)
+			'{ 32'h3CE94A05, 32'h3F963283, 32'h3EC502F9 },	// [51] neg_oct1_sub2: [-0.6875, -0.6250)
+			'{ 32'h3CDEBFD9, 32'h3F95B9A5, 32'h3EC3A873 },	// [52] neg_oct1_sub3: [-0.7500, -0.6875)
+			'{ 32'h3CBA99F8, 32'h3F94399D, 32'h3EBFAC42 },	// [53] neg_oct1_sub4: [-0.8125, -0.7500)
+			'{ 32'h3C72FDCF, 32'h3F91B9DE, 32'h3EB9885F },	// [54] neg_oct1_sub5: [-0.8750, -0.8125)
+			'{ 32'h3B4B3567, 32'h3F8E4C54, 32'h3EB1B499 },	// [55] neg_oct1_sub6: [-0.9375, -0.8750)
+			'{ 32'hBC4CC375, 32'h3F8A0AB5, 32'h3EA8A16E },	// [56] neg_oct1_sub7: [-1.0000, -0.9375)
+			'{ 32'hBD2F53B5, 32'h3F8262A4, 32'h3E9983F9 },	// [57] neg_oct2_sub0: [-1.1250, -1.0000)
+			'{ 32'hBDC3067D, 32'h3F6CECEE, 32'h3E8452A4 },	// [58] neg_oct2_sub1: [-1.2500, -1.1250)
+			'{ 32'hBE223FD6, 32'h3F530738, 32'h3E5F3123 },	// [59] neg_oct2_sub2: [-1.3750, -1.2500)
+			'{ 32'hBE6A52BB, 32'h3F38CEB4, 32'h3E390648 },	// [60] neg_oct2_sub3: [-1.5000, -1.3750)
+			'{ 32'hBE9B0459, 32'h3F1F8DBB, 32'h3E175293 },	// [61] neg_oct2_sub4: [-1.6250, -1.5000)
+			'{ 32'hBEC11234, 32'h3F081E0B, 32'h3DF4E633 },	// [62] neg_oct2_sub5: [-1.7500, -1.6250)
+			'{ 32'hBEE60A4E, 32'h3EE5F2D2, 32'h3DC485E9 },	// [63] neg_oct2_sub6: [-1.8750, -1.7500)
+			'{ 32'hBF047FBC, 32'h3EC0A0B4, 32'h3D9CAC71 },	// [64] neg_oct2_sub7: [-2.0000, -1.8750)
+			'{ 32'hBF1BE8DE, 32'h3E924A89, 32'h3D5D9C65 },	// [65] neg_oct3_sub0: [-2.2500, -2.0000)
+			'{ 32'hBF365580, 32'h3E464E5F, 32'h3D097CEB },	// [66] neg_oct3_sub1: [-2.5000, -2.2500)
+			'{ 32'hBF4ADFA4, 32'h3E045D55, 32'h3CA91712 },	// [67] neg_oct3_sub2: [-2.7500, -2.5000)
+			'{ 32'hBF5A4B68, 32'h3DAEBB3F, 32'h3C4ED637 },	// [68] neg_oct3_sub3: [-3.0000, -2.7500)
+			'{ 32'hBF6594F3, 32'h3D64B7EA, 32'h3BFC2E52 },	// [69] neg_oct3_sub4: [-3.2500, -3.0000)
+			'{ 32'hBF6DAF87, 32'h3D14B63C, 32'h3B996D41 },	// [70] neg_oct3_sub5: [-3.5000, -3.2500)
+			'{ 32'hBF736AA0, 32'h3CC05DAC, 32'h3B3A76BE },	// [71] neg_oct3_sub6: [-3.7500, -3.5000)
+			'{ 32'hBF776B87, 32'h3C77C0C9, 32'h3AE272C7 },	// [72] neg_oct3_sub7: [-4.0000, -3.7500)
+			'{ 32'hBF7B291E, 32'h3C00F319, 32'h3A590184 },	// [73] neg_oct4_sub0: [-4.5000, -4.0000)
+			'{ 32'hBF7DD41A, 32'h3B51D16D, 32'h399FC02C },	// [74] neg_oct4_sub1: [-5.0000, -4.5000)
+			'{ 32'hBF7F0B06, 32'h3AA9199D, 32'h38EB1F74 },	// [75] neg_oct4_sub2: [-5.5000, -5.0000)
+			'{ 32'hBF7F95A4, 32'h3A073D0F, 32'h382D01A6 },	// [76] neg_oct4_sub3: [-6.0000, -5.5000)
+			'{ 32'hBF7FD268, 32'h3956EC40, 32'h377E96C4 },	// [77] neg_oct4_sub4: [-6.5000, -6.0000)
+			'{ 32'hBF7FECAA, 32'h38A9D71D, 32'h36BB5164 },	// [78] neg_oct4_sub5: [-7.0000, -6.5000)
+			'{ 32'hBF7FF7E0, 32'h38059366, 32'h3609D243 },	// [79] neg_oct4_sub6: [-7.5000, -7.0000)
+			'{ 32'hBF7FFC9E, 32'h37513C1C, 32'h354ACE8B }	// [80] neg_oct4_sub7: [-8.0000, -7.5000)
+		}
+	};
+
+endpackage
\ No newline at end of file
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv
new file mode 100644
index 0000000000..574b073b3e
--- /dev/null
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv
@@ -0,0 +1,145 @@
+/****************************************************************************
+ * Copyright (C) 2026, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * @brief	Testbench for pwpolyf: FP32 piecewise polynomial activation.
+ * @author	Shane Fleming <shane.fleming@amd.com>
+ *
+ * @description
+ *	Tests all four activation functions (gelu, silu, sigmoid, tanh) in
+ *	parallel using random FP32 stimulus with online shortreal-based
+ *	checking against a reference function.
+ ***************************************************************************/
+
+module pwpolyf_tb;
+
+	localparam int unsigned  TEST_COUNT = 4;
+	localparam string  FUNCS[TEST_COUNT] = '{"gelu", "silu", "sigmoid", "tanh"};
+	localparam int unsigned  RUNS = 4096;
+
+	// Global Control: free-running clock, start-up reset and end-of-simulation trigger
+	logic  clk = 0;
+	always #5ns clk = !clk;	// toggles every 5ns, i.e. a 10ns clock period
+	logic  rst = 1;
+	initial begin
+		repeat(12) @(posedge clk);	// keep rst asserted across the first 12 rising edges
+		rst <= 0;
+	end
+
+	bit [TEST_COUNT-1:0]  done = '0;	// one completion flag per generated test, set by its output collector
+	always_comb begin
+		if(&done)  $finish;	// stop the simulation once every flag is set
+	end
+
+	for(genvar  t = 0; t < TEST_COUNT; t++) begin : genTests
+		localparam string  FUNC = FUNCS[t];
+
+		// DUT wired for PE=1: one 32-bit word per input/output transfer
+		logic [31:0]  xdat;	// input data, driven by the stimulus driver
+		logic  xvld;	// input valid, driven by the stimulus driver
+		uwire  xrdy;	// input ready, driven by the DUT
+		uwire [31:0]  ydat;	// output data, driven by the DUT
+		uwire  yvld;	// output valid, driven by the DUT
+		logic  yrdy;	// output ready, driven by the output collector
+
+		pwpolyf #(.PE(1), .FUNC(FUNC)) dut (
+			.clk, .rst,
+			.xdat, .xvld, .xrdy,
+			.ydat, .yvld, .yrdy
+		);
+		shortreal  y;	// ydat reinterpreted as a shortreal for the checker
+		assign  y = $bitstoshortreal(ydat);
+
+		// Reference function for FUNC -- saturates for |x| >= 8, otherwise computes in real and casts to shortreal
+		function automatic shortreal ref_func(input shortreal x);
+			automatic real  xr = real'(x);
+			automatic real  yr;
+			if(xr >= 8.0)	// positive saturation: gelu/silu pass x through, the others return 1.0
+				return (FUNC == "gelu" || FUNC == "silu")? x : shortreal'(1.0);
+			if(xr <= -8.0)	// negative saturation: tanh returns -1.0, the others return 0.0
+				return (FUNC == "tanh")? shortreal'(-1.0) : shortreal'(0.0);
+			if(FUNC == "gelu") begin	// tanh-based GELU formula: 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
+				automatic real  t = $tanh($sqrt(2.0/3.14159265358979) * (xr + 0.044715*xr*xr*xr));
+				yr = 0.5 * xr * (1.0 + t);
+			end
+			else if(FUNC == "silu")  yr = xr / (1.0 + $exp(-xr));
+			else if(FUNC == "sigmoid")  yr = 1.0 / (1.0 + $exp(-xr));
+			else  yr = $tanh(xr);	// any other FUNC value is evaluated as tanh
+			return shortreal'(yr);
+		endfunction
+
+		// Online checking state
+		shortreal  ExpQ[$];
+
+		// Stimulus driver: feeds RUNS random FP32 words and queues the expected result of each
+		initial begin
+			xdat = '0;
+			xvld = 0;
+			@(posedge clk iff !rst);  // start only after reset has been released
+
+			repeat(RUNS) begin
+				automatic logic [31:0]  vbits;
+
+				// Cover all 5 octaves (exp 125..129) plus the clamp octave (exp 130), both signs
+				vbits = 32'h3E800000 + ($urandom() % 32'h03000000);  // |x| in [0.25, 16.0)
+				if($urandom() % 2)  vbits[31] = 1;  // random sign
+				if($urandom() % 4 == 0) vbits = 32'h3F800000;  // 1.0
+				if($urandom() % 8 == 0) vbits = 32'h00000000;  // 0.0
+				if($urandom() % 8 == 0) vbits = 32'h40E00000 | ($urandom() % 32'h00100000);  // [7.0, 7.5)
+
+				while($urandom() % 17 == 0) @(posedge clk);  // random idle cycles between inputs
+
+				xdat <= vbits;
+				xvld <= 1;
+
+				@(posedge clk iff xrdy);  // input accepted on the first rising edge with xrdy high
+				ExpQ.push_back(ref_func($bitstoshortreal(vbits)));  // expected value for the online checker
+
+				xvld <= 0;
+			end
+		end
+
+		always_ff @(posedge clk iff yvld && yrdy) begin	// online checker: runs on every accepted output transfer
+			automatic shortreal  exp, err;
+			assert(ExpQ.size) else begin	// an output with no queued expectation is spurious
+				$error("[%s] Spurious output.", FUNC);
+				$stop;
+			end
+			exp = ExpQ.pop_front();
+			err = y - exp;
+			err *= err;	// squared error, so the 1e-3 bound below applies to (y - exp)^2
+			assert((err < 1e-3) || ($shortrealtobits(y) == $shortrealtobits(exp))) else begin	// small error or bit-identical
+				$error("[%s] Output mismatch: %f/%08x instead of %f/%08x",
+					FUNC, y, $shortrealtobits(y), exp, $shortrealtobits(exp));
+				$stop;
+			end
+		end
+
+		// Output collector -- drives yrdy backpressure and accepts exactly RUNS outputs
+		initial begin
+			yrdy = 0;
+			@(posedge clk iff !rst);	// start only after reset has been released
+
+			repeat(RUNS) begin
+				while($urandom() % 17 == 0) @(posedge clk);	// random stall cycles with yrdy low
+				yrdy <= 1;
+				@(posedge clk iff yvld);	// wait for a rising edge with yvld high while yrdy is asserted
+				yrdy <= 0;
+			end
+
+			// Verify all expected outputs were consumed (checked one clock edge after the last transfer)
+			@(posedge clk);
+			assert(ExpQ.size() == 0) else begin
+				$error("[%s] Missing %0d outputs.", FUNC, ExpQ.size());
+				$stop;
+			end
+
+			$display("PWPOLYF[%s]: %0d outputs verified online.", FUNC, RUNS);
+			done[t] = 1;	// report completion of this test to the global finish logic
+		end
+
+	end : genTests
+
+endmodule : pwpolyf_tb
\ No newline at end of file
diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
index d4736f7fee..dccc9e8240 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
@@ -43,6 +43,7 @@
     rtlsim_output_to_npy,
 )
 from finn.util.pwpolyf import (
+    CLAMP_CFG,
     NUM_OCTAVES,
     SUPPORTED_FUNCS,
     _fit_coefficients,
@@ -59,36 +60,64 @@ def _float_to_hex(f):
     return "%08X" % struct.unpack("!I", struct.pack("!f", float(f)))[0]
 
 
-def generate_coeffs_svh(K, num_samples=1000):
-    """Generate the pwpolyf_coeffs.svh file content for a given K value."""
+def generate_coeffs_pkg(K, num_samples=1000):
+    """Generate the pwpolyf_pkg.sv package content for a given K value.
+
+    Produces a SystemVerilog package with a func_cfg_t struct per activation
+    function, containing clamping parameters and polynomial coefficients.
+    """
+    degree = 2
     num_subs = 1 << K
     num_segs = 1 + 2 * NUM_OCTAVES * num_subs
 
     lines = []
     lines.append("// Auto-generated by pwpolyf_rtl.py — do not edit manually.")
-    lines.append("// K=%d, NUM_SEGS=%d, NUM_OCTAVES=%d, DEGREE=2" % (K, num_segs, NUM_OCTAVES))
+    lines.append(
+        "// DEGREE=%d K=%d NUM_OCTAVES=%d  Segments: %d"
+        % (degree, K, NUM_OCTAVES, num_segs)
+    )
+    lines.append("")
+    lines.append("package pwpolyf_pkg;")
+    lines.append("")
+    lines.append("    localparam int unsigned  DEGREE      = %d;" % degree)
+    lines.append("    localparam int unsigned  K           = %d;" % K)
+    lines.append("    localparam int unsigned  NUM_OCTAVES = %d;" % NUM_OCTAVES)
+    lines.append("    localparam int unsigned  NUM_SEGS    = %d;" % num_segs)
     lines.append("")
-    lines.append("localparam int unsigned PWPOLYF_K          = %d;" % K)
-    lines.append("localparam int unsigned PWPOLYF_NUM_SEGS   = %d;" % num_segs)
-    lines.append("localparam int unsigned PWPOLYF_NUM_OCTAVES = %d;" % NUM_OCTAVES)
-    lines.append("localparam int unsigned PWPOLYF_DEGREE     = 2;")
+    lines.append("    typedef struct {")
+    lines.append("        logic [31:0]  neg_clamp;")
+    lines.append("        logic [31:0]  pos_clamp;")
+    lines.append("        bit           pos_passthrough;")
+    lines.append("        logic [31:0]  coeffs[NUM_SEGS][DEGREE+1];")
+    lines.append("    } func_cfg_t;")
 
     for func_name in SUPPORTED_FUNCS:
+        cfg = CLAMP_CFG[func_name]
         coeffs = _fit_coefficients(func_name, K, num_samples)
-        label = "PWPOLYF_%s_COEFFS" % func_name.upper()
+        label = func_name.upper()
+        neg_hex = _float_to_hex(cfg["neg_clamp"])
+        pos_hex = _float_to_hex(cfg["pos_clamp"])
+        passthrough = 1 if cfg["pos_passthrough"] else 0
+
         lines.append("")
-        lines.append("localparam logic [31:0] %s[%d][3] = '{" % (label, num_segs))
+        lines.append("    localparam func_cfg_t  %s = '{" % label)
+        lines.append("        neg_clamp:       32'h%s," % neg_hex)
+        lines.append("        pos_clamp:       32'h%s," % pos_hex)
+        lines.append("        pos_passthrough: %d," % passthrough)
+        lines.append("        coeffs: '{")
         for seg in range(num_segs):
-            c0 = _float_to_hex(coeffs[seg, 0])
-            c1 = _float_to_hex(coeffs[seg, 1])
-            c2 = _float_to_hex(coeffs[seg, 2])
+            coeff_strs = []
+            for c in range(degree + 1):
+                coeff_strs.append("32'h%s" % _float_to_hex(coeffs[seg, c]))
             comma = "," if seg < num_segs - 1 else ""
             lines.append(
-                "    '{ 32'h%s, 32'h%s, 32'h%s }%s  // seg %d"
-                % (c0, c1, c2, comma, seg)
+                "            '{ %s }%s\t// seg %d" % (", ".join(coeff_strs), comma, seg)
             )
-        lines.append("};")
+        lines.append("        }")
+        lines.append("    };")
 
+    lines.append("")
+    lines.append("endpackage")
     lines.append("")
     return "\n".join(lines)
 
@@ -125,8 +154,8 @@ def prepare_codegen_rtl_values(self, model):
 
     def get_rtl_file_list(self):
         return [
+            "pwpolyf_pkg.sv",
             "pwpolyf.sv",
-            "pwpolyf_coeffs.svh",
             "queue.sv",
             "pwpolyf_template_wrapper.v",
         ]
@@ -166,11 +195,11 @@ def generate_hdl(self, model, fpgapart, clk):
             file_only_path = rtl_file_path.split("/")[-1]
             self.dump_rtl_data(code_gen_dir, file_only_path, data)
 
-        # generate coefficients .svh matching the node's K value
+        # generate package with coefficients matching the node's K value
         K = self.get_nodeattr("K")
-        svh_data = generate_coeffs_svh(K)
-        with open(os.path.join(code_gen_dir, "pwpolyf_coeffs.svh"), "w") as f:
-            f.write(svh_data)
+        pkg_data = generate_coeffs_pkg(K)
+        with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "w") as f:
+            f.write(pkg_data)
 
         self.set_nodeattr("ipgen_path", code_gen_dir)
         self.set_nodeattr("ip_path", code_gen_dir)
@@ -181,11 +210,9 @@ def prepare_rtlsim(self):
 
         code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
         verilog_paths = [code_gen_dir]
-        # exclude .svh — it is pulled in via `include from pwpolyf.sv
         verilog_files = [
             x.replace("pwpolyf_template_wrapper", self.get_nodeattr("gen_top_module"))
             for x in self.get_rtl_file_list()
-            if not x.endswith(".svh")
         ]
         single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
 
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index 2427a4514a..c97267bb54 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+# Copyright (C) 2023-2026, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -275,8 +275,7 @@ def apply(self, model):
 
 
 class InferPWPolyFLayer(Transformation):
-    """Convert PWPolyF custom ops and standard ONNX activations (Gelu, Sigmoid,
-    Tanh, SiLU pattern) into piecewise polynomial HW layers."""
+    """Convert activations to piecewise polynomial HW layers."""
 
     _SINGLE_OP_MAP = {"Gelu": "gelu", "Tanh": "tanh"}
 
@@ -292,18 +291,12 @@ def _is_const_scalar(model, tensor_name, value, tol=1e-3):
         return init.size == 1 and abs(float(init.flat[0]) - value) < tol
 
     def _match_erf_gelu(self, model, erf_node):
-        """Try to match the Erf-based GELU decomposition rooted at *erf_node*.
-
-        Pattern (opset < 20):
-            Div(x, sqrt(2)) → Erf → Add(_, 1) → Mul(0.5, _) → Mul(x, _)
-
-        Returns (pwp_input, pwp_output, nodes_to_remove) on success, else None.
-        """
-        # --- backward: Erf input must come from Div(x, sqrt(2)) ---
+        """Match Erf-based GELU: Div(x,sqrt(2))→Erf→Add(_,1)→Mul(0.5,_)→Mul(x,_).
+        Returns (pwp_input, pwp_output, nodes_to_remove) or None."""
+        # backward: Erf input must come from Div(x, sqrt(2))
         div_node = model.find_producer(erf_node.input[0])
         if div_node is None or div_node.op_type != "Div":
             return None
-        # one Div input is x, the other is sqrt(2) ≈ 1.4142
         if self._is_const_scalar(model, div_node.input[1], 1.4142135):
             gelu_input = div_node.input[0]
         elif self._is_const_scalar(model, div_node.input[0], 1.4142135):
@@ -311,7 +304,7 @@ def _match_erf_gelu(self, model, erf_node):
         else:
             return None
 
-        # --- forward: Erf → Add(_, 1) ---
+        # forward: Erf → Add(_, 1)
         erf_consumers = model.find_consumers(erf_node.output[0])
         if len(erf_consumers) != 1 or erf_consumers[0].op_type != "Add":
             return None
@@ -320,7 +313,7 @@ def _match_erf_gelu(self, model, erf_node):
         if len(other_add) != 1 or not self._is_const_scalar(model, other_add[0], 1.0):
             return None
 
-        # --- Add → Mul(0.5, _) ---
+        # Add → Mul(0.5, _)
         add_consumers = model.find_consumers(add_node.output[0])
         if len(add_consumers) != 1 or add_consumers[0].op_type != "Mul":
             return None
@@ -329,7 +322,7 @@ def _match_erf_gelu(self, model, erf_node):
         if len(other_mul_half) != 1 or not self._is_const_scalar(model, other_mul_half[0], 0.5):
             return None
 
-        # --- Mul(0.5,_) → Mul(x, _) ---
+        # Mul(0.5,_) → Mul(x, _)
         half_consumers = model.find_consumers(mul_half.output[0])
         if len(half_consumers) != 1 or half_consumers[0].op_type != "Mul":
             return None

From 1f6c5eb1dcf6ee5543effbc9f6d5456f246d906a Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Fri, 24 Apr 2026 14:27:14 +0100
Subject: [PATCH 04/12] pkg changes

---
 finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv             | 14 ++++++--------
 src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py |  6 +++---
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
index 2838f03fe2..fc2d3ace3b 100644
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
@@ -1,7 +1,5 @@
 /****************************************************************************
- * Copyright (C) 2026, Advanced Micro Devices, Inc.
- * All rights reserved.
- *
+ * Copyright Advanced Micro Devices, Inc.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * @brief	Per-activation configuration for pwpolyf.
@@ -12,7 +10,7 @@
  *	configuration (clamping parameters + coefficient arrays) for the
  *	piecewise polynomial activation unit.
  *
- *	Coefficient data auto-generated by pwpolyf_coeffs.py -- DEGREE=2 K=3
+ *	Coefficient data auto-generated by pwpolyf_rtl.py -- DEGREE=2 K=3
  *	NUM_OCTAVES=5.  Segments: 81  Coefficients per segment: 3
  *	Polynomial: y = a_0 + a_1*x + a_2*x^2 + ... + a_d*x^d
  *	Horner form: y = a_0 + x*(a_1 + x*(a_2 + ... x*a_d))
@@ -30,10 +28,10 @@ package pwpolyf_pkg;
 	localparam int unsigned  NUM_SEGS    = 81;
 
 	typedef struct {
-		logic [31:0]  neg_clamp;
-		logic [31:0]  pos_clamp;
-		bit           pos_passthrough;
-		logic [31:0]  coeffs[NUM_SEGS][DEGREE+1];
+		int unsigned  neg_clamp;
+		int unsigned  pos_clamp;
+		bit	      pos_passthrough;
+		int unsigned  coeffs[NUM_SEGS][DEGREE+1];
 	} func_cfg_t;
 
 	localparam func_cfg_t  GELU = '{
diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
index dccc9e8240..8045a94dbc 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
@@ -85,10 +85,10 @@ def generate_coeffs_pkg(K, num_samples=1000):
     lines.append("    localparam int unsigned  NUM_SEGS    = %d;" % num_segs)
     lines.append("")
     lines.append("    typedef struct {")
-    lines.append("        logic [31:0]  neg_clamp;")
-    lines.append("        logic [31:0]  pos_clamp;")
+    lines.append("        int unsigned  neg_clamp;")
+    lines.append("        int unsigned  pos_clamp;")
     lines.append("        bit           pos_passthrough;")
-    lines.append("        logic [31:0]  coeffs[NUM_SEGS][DEGREE+1];")
+    lines.append("        int unsigned  coeffs[NUM_SEGS][DEGREE+1];")
     lines.append("    } func_cfg_t;")
 
     for func_name in SUPPORTED_FUNCS:

From 6d01b10309f89f8264ef3b8dcdee4f581dd38baa Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Fri, 24 Apr 2026 16:03:23 +0100
Subject: [PATCH 05/12] linting

---
 finn-rtllib/pwpolyf/hdl/pwpolyf.abc           |  2 +-
 finn-rtllib/pwpolyf/hdl/pwpolyf.sv            |  2 +-
 finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv        |  2 +-
 finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv         |  2 +-
 src/finn/custom_op/fpgadataflow/pwpolyf.py    |  5 +-
 .../custom_op/fpgadataflow/rtl/__init__.py    |  2 +-
 .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py | 18 +---
 .../fpgadataflow/convert_to_hw_layers.py      | 83 +++++++++----------
 src/finn/util/pwpolyf.py                      | 22 ++---
 .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 77 ++++++++++-------
 10 files changed, 111 insertions(+), 104 deletions(-)

diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.abc b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc
index 06b77b967d..c25b5fda3d 100644
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf.abc
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.abc
@@ -2,4 +2,4 @@ import  queue
 read_sv pwpolyf_pkg.sv
 read_sv pwpolyf.sv
 setup_tb  pwpolyf_tb
-setup_top pwpolyf
\ No newline at end of file
+setup_top pwpolyf
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
index a2257fe17f..32c0b5ea6b 100644
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf.sv
@@ -323,4 +323,4 @@ module pwpolyf #(
 		end
 	end
 
-endmodule : pwpolyf
\ No newline at end of file
+endmodule : pwpolyf
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
index fc2d3ace3b..cdf479355e 100644
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv
@@ -390,4 +390,4 @@ package pwpolyf_pkg;
 		}
 	};
 
-endpackage
\ No newline at end of file
+endpackage
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv
index 574b073b3e..f98929e2ab 100644
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_tb.sv
@@ -142,4 +142,4 @@ module pwpolyf_tb;
 
 	end : genTests
 
-endmodule : pwpolyf_tb
\ No newline at end of file
+endmodule : pwpolyf_tb
diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
index e05ba9c2aa..d412e1669a 100644
--- a/src/finn/custom_op/fpgadataflow/pwpolyf.py
+++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py
@@ -177,8 +177,9 @@ def execute_node(self, context, graph):
         K = self.get_nodeattr("K")
 
         # lazy import to avoid hard dependency on torch at module level
-        import torch
-        from finn.util.pwpolyf import PiecewisePolyActivation
+        import torch  # noqa: PLC0415
+
+        from finn.util.pwpolyf import PiecewisePolyActivation  # noqa: PLC0415
 
         mod = PiecewisePolyActivation(func, K=K)
         with torch.no_grad():
diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
index 77c1cb374d..053b8e8f02 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py
@@ -39,12 +39,12 @@
 from finn.custom_op.fpgadataflow.rtl.inner_shuffle_rtl import InnerShuffle_rtl
 from finn.custom_op.fpgadataflow.rtl.layernorm_rtl import LayerNorm_rtl
 from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MVAU_rtl
+from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import PWPolyF_rtl
 from finn.custom_op.fpgadataflow.rtl.requant_rtl import Requant_rtl
 from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import (
     StreamingDataWidthConverter_rtl,
 )
 from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl
-from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import PWPolyF_rtl
 from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl
 from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl
 
diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
index 2935b37912..6bd80dd0df 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
@@ -32,12 +32,7 @@
 
 from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
 from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
-from finn.util.pwpolyf import (
-    CLAMP_CFG,
-    NUM_OCTAVES,
-    SUPPORTED_FUNCS,
-    _fit_coefficients,
-)
+from finn.util.pwpolyf import CLAMP_CFG, NUM_OCTAVES, SUPPORTED_FUNCS, _fit_coefficients
 
 
 def _float_to_hex(f):
@@ -58,8 +53,7 @@ def generate_coeffs_pkg(K, num_samples=1000):
     lines = []
     lines.append("// Auto-generated by pwpolyf_rtl.py — do not edit manually.")
     lines.append(
-        "// DEGREE=%d K=%d NUM_OCTAVES=%d  Segments: %d"
-        % (degree, K, NUM_OCTAVES, num_segs)
+        "// DEGREE=%d K=%d NUM_OCTAVES=%d  Segments: %d" % (degree, K, NUM_OCTAVES, num_segs)
     )
     lines.append("")
     lines.append("package pwpolyf_pkg;")
@@ -95,9 +89,7 @@ def generate_coeffs_pkg(K, num_samples=1000):
             for c in range(degree + 1):
                 coeff_strs.append("32'h%s" % _float_to_hex(coeffs[seg, c]))
             comma = "," if seg < num_segs - 1 else ""
-            lines.append(
-                "            '{ %s }%s\t// seg %d" % (", ".join(coeff_strs), comma, seg)
-            )
+            lines.append("            '{ %s }%s\t// seg %d" % (", ".join(coeff_strs), comma, seg))
         lines.append("        }")
         lines.append("    };")
 
@@ -163,9 +155,7 @@ def generate_hdl(self, model, fpgapart, clk):
     def get_rtl_file_list(self, abspath=False):
         if abspath:
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/"
-            rtllib_dir = os.path.join(
-                os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/"
-            )
+            rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/")
         else:
             code_gen_dir = ""
             rtllib_dir = ""
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index 9b78cb6e40..73f2dadb2d 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -354,8 +354,13 @@ def apply(self, model):
                 K = K_attr.i if K_attr is not None else 3
 
                 new_node = self._make_pwpolyf_node(
-                    pwp_input, pwp_output, func, pwp_in_shape, idt,
-                    "PWPolyF_" + node.name, K,
+                    pwp_input,
+                    pwp_output,
+                    func,
+                    pwp_in_shape,
+                    idt,
+                    "PWPolyF_" + node.name,
+                    K,
                 )
                 graph.node.insert(node_ind, new_node)
                 graph.node.remove(node)
@@ -374,7 +379,11 @@ def apply(self, model):
 
                 func = self._SINGLE_OP_MAP[node.op_type]
                 new_node = self._make_pwpolyf_node(
-                    pwp_input, pwp_output, func, pwp_in_shape, idt,
+                    pwp_input,
+                    pwp_output,
+                    func,
+                    pwp_in_shape,
+                    idt,
                     "PWPolyF_" + node.name,
                 )
                 graph.node.insert(node_ind, new_node)
@@ -410,7 +419,11 @@ def apply(self, model):
                             nodes_to_remove.append(mul_cand)
 
                 new_node = self._make_pwpolyf_node(
-                    sig_input, pwp_output, func, pwp_in_shape, idt,
+                    sig_input,
+                    pwp_output,
+                    func,
+                    pwp_in_shape,
+                    idt,
                     "PWPolyF_" + node.name,
                 )
                 graph.node.insert(node_ind, new_node)
@@ -433,7 +446,11 @@ def apply(self, model):
                     continue
 
                 new_node = self._make_pwpolyf_node(
-                    pwp_input, pwp_output, "gelu", pwp_in_shape, idt,
+                    pwp_input,
+                    pwp_output,
+                    "gelu",
+                    pwp_in_shape,
+                    idt,
                     "PWPolyF_" + node.name,
                 )
                 graph.node.insert(node_ind, new_node)
@@ -1568,13 +1585,11 @@ def apply(self, model):
                 mm_in_shape = model.get_tensor_shape(mm_input)
                 mm_out_shape = model.get_tensor_shape(mm_output)
                 assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
-                    n.name
-                    + """: First
+                    n.name + """: First
                 input for xnorpopcount is not Wset to FINN DataType BINARY."""
                 )
                 assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
-                    n.name
-                    + """: Second
+                    n.name + """: Second
                 input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
                 )
                 idt = DataType["BINARY"]
@@ -1592,8 +1607,7 @@ def apply(self, model):
                 simd = 1
                 wmem = mw * mh // (pe * simd)
                 assert mw * mh == wmem * pe * simd, (
-                    n.name
-                    + """: Requirement (MW * MH) divisiable by
+                    n.name + """: Requirement (MW * MH) divisiable by
                 (WMEM * PE * SIMD) is violated."""
                 )
                 # see if we have any following thresholds
@@ -1606,8 +1620,7 @@ def apply(self, model):
                     mt_thres = consumer.input[1]
                     T = model.get_initializer(mt_thres)
                     assert T.shape[0] == 1 or T.shape[0] == mh, (
-                        consumer.name
-                        + """: First dimension of
+                        consumer.name + """: First dimension of
                     thresholds neither 1 nor MH."""
                     )
                     odt = model.get_tensor_datatype(mt_output)
@@ -1719,8 +1732,7 @@ def apply(self, model):
                     simd = 1
                     wmem = mw * mh // (pe * simd)
                     assert mw * mh == wmem * pe * simd, (
-                        n.name
-                        + """: Requirement (MW * MH) divisible by
+                        n.name + """: Requirement (MW * MH) divisible by
                     (WMEM * PE * SIMD) is violated."""
                     )
                     # see if we have any following thresholds
@@ -1733,8 +1745,7 @@ def apply(self, model):
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
                         assert T.shape[0] == 1 or T.shape[0] == mh, (
-                            consumer.name
-                            + """: First dimension of
+                            consumer.name + """: First dimension of
                         thresholds neither 1 nor MH."""
                         )
                         odt = model.get_tensor_datatype(mt_output)
@@ -1843,11 +1854,8 @@ def apply(self, model):
                 try:
                     k_h, k_w = sparsity["dw"]["kernel_shape"]
                 except KeyError:
-                    raise Exception(
-                        n.name
-                        + """: sparsity annotation doesn't indicate that MatMul
-                        belongs to a depthwise convolution."""
-                    )
+                    raise Exception(n.name + """: sparsity annotation doesn't indicate that MatMul
+                        belongs to a depthwise convolution.""")
 
                 mm_input = n.input[0]
                 mm_weight = n.input[1]
@@ -1890,8 +1898,7 @@ def apply(self, model):
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
                         assert T.shape[0] == 1 or T.shape[0] == channels, (
-                            consumer.name
-                            + """: First dimension of
+                            consumer.name + """: First dimension of
                         thresholds neither 1 nor Channels."""
                         )
                         odt = model.get_tensor_datatype(mt_output)
@@ -2076,9 +2083,7 @@ def apply(self, model):
                         to_remove.append(consumer)
 
                 # Handle None shapes (shape inference might have failed)
-                assert (
-                    in_reshaped is not None
-                ), f"""Could not infer shape for tensor {n.input[0]}.
+                assert in_reshaped is not None, f"""Could not infer shape for tensor {n.input[0]}.
                     Please run InferShapes first"""
                 assert (
                     out_reshaped is not None
@@ -2090,28 +2095,22 @@ def apply(self, model):
 
                 # Some sanity checks for the transformation
                 if idt != odt:
-                    raise RuntimeError(
-                        """
+                    raise RuntimeError("""
                     Input datatype and output datatype of the shuffle must be the same,
                     did something go wrong during transformation?
-                    """
-                    )
+                    """)
 
                 if len(perm.ints) != len(in_reshaped):
-                    raise RuntimeError(
-                        f"""
+                    raise RuntimeError(f"""
                     Permutation list {perm.ints=} does not match the reshaped input dimension
                     {in_reshaped=}
-                    """
-                    )
+                    """)
 
                 if len(perm.ints) != len(out_shape):
-                    raise RuntimeError(
-                        f"""
+                    raise RuntimeError(f"""
                     Permutation list {perm.ints=} does not match the reshaped out dimension
                     {out_reshaped=}
-                    """
-                    )
+                    """)
 
                 simd = 1
 
@@ -2417,10 +2416,8 @@ def apply(self, model):
                 scale_is_one = (scale == 1).all()
                 bias_is_zero = not np.any(bias)
                 if not (scale_is_one and (bias_is_zero or bias is not None)):
-                    warnings.warn(
-                        f"""{node.name}: Scale is not one or bias is not zero.
-                        Can't be converted to HWCustomOp. Please run ExtractNormScaleBias first."""
-                    )
+                    warnings.warn(f"""{node.name}: Scale is not one or bias is not zero.
+                        Can't be converted to HWCustomOp. Please run ExtractNormScaleBias first.""")
                     continue
                 act_in = node.input[0]
                 act_out = node.output[0]
diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py
index 484cfde85c..da3f65e246 100644
--- a/src/finn/util/pwpolyf.py
+++ b/src/finn/util/pwpolyf.py
@@ -55,10 +55,10 @@
 }
 
 CLAMP_CFG = {
-    "gelu":    {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
-    "silu":    {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
+    "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
+    "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
     "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False},
-    "tanh":    {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False},
+    "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False},
 }
 
 
@@ -73,7 +73,7 @@ def _segment_boundaries(K):
     # Positive segments
     for octave in range(NUM_OCTAVES):
         exp_val = EXP_BASE + octave - EXP_BIAS
-        base = 2.0 ** exp_val
+        base = 2.0**exp_val
         for sub in range(num_subs):
             lo = base * (1.0 + sub / num_subs)
             hi = base * (1.0 + (sub + 1) / num_subs)
@@ -82,7 +82,7 @@ def _segment_boundaries(K):
     # Negative segments (mirror of positive)
     for octave in range(NUM_OCTAVES):
         exp_val = EXP_BASE + octave - EXP_BIAS
-        base = 2.0 ** exp_val
+        base = 2.0**exp_val
         for sub in range(num_subs):
             lo = base * (1.0 + sub / num_subs)
             hi = base * (1.0 + (sub + 1) / num_subs)
@@ -151,9 +151,7 @@ def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
         orig_shape = x.shape
         x_flat = x.contiguous().view(-1)
 
-        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(
-            x_flat, K, num_subs, num_segs
-        )
+        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs)
 
         c = coeffs[seg_idx]
         a0 = c[:, 0]
@@ -207,8 +205,12 @@ def __init__(self, func="gelu", K=3, fit_samples=1000):
     def forward(self, x):
         if torch.onnx.is_in_onnx_export():
             return PWPolyFFunction.apply(
-                x, self.coeffs, self.neg_clamp_val, self.pos_clamp_val,
-                self.func, self.K,
+                x,
+                self.coeffs,
+                self.neg_clamp_val,
+                self.pos_clamp_val,
+                self.func,
+                self.K,
             )
 
         orig_shape = x.shape
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index e491d82eba..a36117b90d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -41,20 +41,16 @@
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.util.pwpolyf import PiecewisePolyActivation
 from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
+from finn.util.pwpolyf import PiecewisePolyActivation
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
 
 
 def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs):
-    inp = helper.make_tensor_value_info(
-        "inp", TensorProto.FLOAT, num_input_vecs + [num_channels]
-    )
-    outp = helper.make_tensor_value_info(
-        "outp", TensorProto.FLOAT, num_input_vecs + [num_channels]
-    )
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [num_channels])
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [num_channels])
 
     pwpolyf_node = helper.make_node(
         "PWPolyF",
@@ -131,11 +127,16 @@ def test_pwpolyf_onnx_export(func):
         tmpf = f.name
     try:
         torch.onnx.export(
-            mod, dummy, tmpf,
-            input_names=["input"], output_names=["output"],
-            opset_version=13, dynamo=False,
+            mod,
+            dummy,
+            tmpf,
+            input_names=["input"],
+            output_names=["output"],
+            opset_version=13,
+            dynamo=False,
         )
-        import onnx
+        import onnx  # noqa: PLC0415
+
         onnx_model = onnx.load(tmpf)
     finally:
         os.unlink(tmpf)
@@ -161,9 +162,13 @@ def test_pwpolyf_infer_transform(func):
         tmpf = f.name
     try:
         torch.onnx.export(
-            mod, dummy, tmpf,
-            input_names=["inp"], output_names=["outp"],
-            opset_version=13, dynamo=False,
+            mod,
+            dummy,
+            tmpf,
+            input_names=["inp"],
+            output_names=["outp"],
+            opset_version=13,
+            dynamo=False,
         )
         model = ModelWrapper(tmpf)
     finally:
@@ -311,7 +316,10 @@ def make_silu_pattern_model(num_channels, num_input_vecs):
     mul_node = helper.make_node("Mul", ["inp", "sig_out"], ["outp"], name="Mul_0")
 
     graph = helper.make_graph(
-        [sigmoid_node, mul_node], "silu_graph", [inp], [outp],
+        [sigmoid_node, mul_node],
+        "silu_graph",
+        [inp],
+        [outp],
     )
     model = helper.make_model(graph, producer_name="test")
     model = ModelWrapper(model)
@@ -343,7 +351,9 @@ def make_erf_gelu_model(num_channels, num_input_vecs):
 
     graph = helper.make_graph(
         [div_node, erf_node, add_node, mul_half_node, mul_x_node],
-        "erf_gelu_graph", [inp], [outp],
+        "erf_gelu_graph",
+        [inp],
+        [outp],
         initializer=[sqrt2, one, half],
     )
     model = helper.make_model(graph, producer_name="test")
@@ -356,16 +366,18 @@ def make_erf_gelu_model(num_channels, num_input_vecs):
 # ---------- standard ONNX op inference tests ----------
 
 
-@pytest.mark.parametrize("op_type,expected_func", [
-    ("Gelu", "gelu"),
-    ("Sigmoid", "sigmoid"),
-    ("Tanh", "tanh"),
-])
+@pytest.mark.parametrize(
+    "op_type,expected_func",
+    [
+        ("Gelu", "gelu"),
+        ("Sigmoid", "sigmoid"),
+        ("Tanh", "tanh"),
+    ],
+)
 @pytest.mark.parametrize("num_channels", [4, 16])
 @pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]])
 @pytest.mark.fpgadataflow
-def test_pwpolyf_infer_standard_op(op_type, expected_func,
-                                   num_channels, num_input_vecs):
+def test_pwpolyf_infer_standard_op(op_type, expected_func, num_channels, num_input_vecs):
     model = make_standard_activation_model(op_type, num_channels, num_input_vecs)
 
     assert model.graph.node[0].op_type == op_type
@@ -449,8 +461,10 @@ def test_pwpolyf_sigmoid_multi_consumer_no_silu():
     identity_node = helper.make_node("Identity", ["sig_out"], ["outp2"], name="Id_0")
 
     graph = helper.make_graph(
-        [sigmoid_node, mul_node, identity_node], "test_graph",
-        [inp], [outp1, outp2],
+        [sigmoid_node, mul_node, identity_node],
+        "test_graph",
+        [inp],
+        [outp1, outp2],
     )
     model = helper.make_model(graph, producer_name="test")
     model = ModelWrapper(model)
@@ -469,11 +483,14 @@ def test_pwpolyf_sigmoid_multi_consumer_no_silu():
     assert any(n.op_type == "Identity" for n in model.graph.node)
 
 
-@pytest.mark.parametrize("op_type,expected_func", [
-    ("Gelu", "gelu"),
-    ("Sigmoid", "sigmoid"),
-    ("Tanh", "tanh"),
-])
+@pytest.mark.parametrize(
+    "op_type,expected_func",
+    [
+        ("Gelu", "gelu"),
+        ("Sigmoid", "sigmoid"),
+        ("Tanh", "tanh"),
+    ],
+)
 @pytest.mark.fpgadataflow
 def test_pwpolyf_standard_op_execution(op_type, expected_func):
     num_channels = 16

From adc14f2f706a74921d30023928c04a4078395b5a Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Fri, 24 Apr 2026 16:29:52 +0100
Subject: [PATCH 06/12] linting

---
 .../fpgadataflow/convert_to_hw_layers.py      | 56 +++++++++++++------
 1 file changed, 38 insertions(+), 18 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index 73f2dadb2d..3f714d7ae7 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1585,11 +1585,13 @@ def apply(self, model):
                 mm_in_shape = model.get_tensor_shape(mm_input)
                 mm_out_shape = model.get_tensor_shape(mm_output)
                 assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], (
-                    n.name + """: First
+                    n.name
+                    + """: First
                 input for xnorpopcount is not Wset to FINN DataType BINARY."""
                 )
                 assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], (
-                    n.name + """: Second
+                    n.name
+                    + """: Second
                 input (weights) for xnorpopcount is not set to FINN DataType BINARY."""
                 )
                 idt = DataType["BINARY"]
@@ -1607,7 +1609,8 @@ def apply(self, model):
                 simd = 1
                 wmem = mw * mh // (pe * simd)
                 assert mw * mh == wmem * pe * simd, (
-                    n.name + """: Requirement (MW * MH) divisiable by
+                    n.name
+                    + """: Requirement (MW * MH) divisiable by
                 (WMEM * PE * SIMD) is violated."""
                 )
                 # see if we have any following thresholds
@@ -1620,7 +1623,8 @@ def apply(self, model):
                     mt_thres = consumer.input[1]
                     T = model.get_initializer(mt_thres)
                     assert T.shape[0] == 1 or T.shape[0] == mh, (
-                        consumer.name + """: First dimension of
+                        consumer.name
+                        + """: First dimension of
                     thresholds neither 1 nor MH."""
                     )
                     odt = model.get_tensor_datatype(mt_output)
@@ -1732,7 +1736,8 @@ def apply(self, model):
                     simd = 1
                     wmem = mw * mh // (pe * simd)
                     assert mw * mh == wmem * pe * simd, (
-                        n.name + """: Requirement (MW * MH) divisible by
+                        n.name
+                        + """: Requirement (MW * MH) divisible by
                     (WMEM * PE * SIMD) is violated."""
                     )
                     # see if we have any following thresholds
@@ -1745,7 +1750,8 @@ def apply(self, model):
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
                         assert T.shape[0] == 1 or T.shape[0] == mh, (
-                            consumer.name + """: First dimension of
+                            consumer.name
+                            + """: First dimension of
                         thresholds neither 1 nor MH."""
                         )
                         odt = model.get_tensor_datatype(mt_output)
@@ -1854,8 +1860,11 @@ def apply(self, model):
                 try:
                     k_h, k_w = sparsity["dw"]["kernel_shape"]
                 except KeyError:
-                    raise Exception(n.name + """: sparsity annotation doesn't indicate that MatMul
-                        belongs to a depthwise convolution.""")
+                    raise Exception(
+                        n.name
+                        + """: sparsity annotation doesn't indicate that MatMul
+                        belongs to a depthwise convolution."""
+                    )
 
                 mm_input = n.input[0]
                 mm_weight = n.input[1]
@@ -1898,7 +1907,8 @@ def apply(self, model):
                         mt_thres = consumer.input[1]
                         T = model.get_initializer(mt_thres)
                         assert T.shape[0] == 1 or T.shape[0] == channels, (
-                            consumer.name + """: First dimension of
+                            consumer.name
+                            + """: First dimension of
                         thresholds neither 1 nor Channels."""
                         )
                         odt = model.get_tensor_datatype(mt_output)
@@ -2083,7 +2093,9 @@ def apply(self, model):
                         to_remove.append(consumer)
 
                 # Handle None shapes (shape inference might have failed)
-                assert in_reshaped is not None, f"""Could not infer shape for tensor {n.input[0]}.
+                assert (
+                    in_reshaped is not None
+                ), f"""Could not infer shape for tensor {n.input[0]}.
                     Please run InferShapes first"""
                 assert (
                     out_reshaped is not None
@@ -2095,22 +2107,28 @@ def apply(self, model):
 
                 # Some sanity checks for the transformation
                 if idt != odt:
-                    raise RuntimeError("""
+                    raise RuntimeError(
+                        """
                     Input datatype and output datatype of the shuffle must be the same,
                     did something go wrong during transformation?
-                    """)
+                    """
+                    )
 
                 if len(perm.ints) != len(in_reshaped):
-                    raise RuntimeError(f"""
+                    raise RuntimeError(
+                        f"""
                     Permutation list {perm.ints=} does not match the reshaped input dimension
                     {in_reshaped=}
-                    """)
+                    """
+                    )
 
                 if len(perm.ints) != len(out_shape):
-                    raise RuntimeError(f"""
+                    raise RuntimeError(
+                        f"""
                     Permutation list {perm.ints=} does not match the reshaped out dimension
                     {out_reshaped=}
-                    """)
+                    """
+                    )
 
                 simd = 1
 
@@ -2416,8 +2434,10 @@ def apply(self, model):
                 scale_is_one = (scale == 1).all()
                 bias_is_zero = not np.any(bias)
                 if not (scale_is_one and (bias_is_zero or bias is not None)):
-                    warnings.warn(f"""{node.name}: Scale is not one or bias is not zero.
-                        Can't be converted to HWCustomOp. Please run ExtractNormScaleBias first.""")
+                    warnings.warn(
+                        f"""{node.name}: Scale is not one or bias is not zero.
+                        Can't be converted to HWCustomOp. Please run ExtractNormScaleBias first."""
+                    )
                     continue
                 act_in = node.input[0]
                 act_out = node.output[0]

From dd1e700599a8516b6946234d5c6a59dcfd9a70a4 Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Mon, 27 Apr 2026 17:35:17 +0100
Subject: [PATCH 07/12] improved testing

---
 docs/finn/pwpolyf.md                          |  35 ++--
 .../pwpolyf/hdl/pwpolyf_template_wrapper.v    |  14 +-
 finn_xsi/finn_xsi/adapter.py                  |   2 +-
 src/finn/custom_op/fpgadataflow/pwpolyf.py    |  12 +-
 .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py |  14 +-
 .../fpgadataflow/convert_to_hw_layers.py      |   5 +-
 src/finn/util/pwpolyf.py                      |  35 ++--
 .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 187 +++++++++++++++++-
 8 files changed, 249 insertions(+), 55 deletions(-)

diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md
index cd8510a7ef..11b86e88a4 100644
--- a/docs/finn/pwpolyf.md
+++ b/docs/finn/pwpolyf.md
@@ -16,11 +16,12 @@ K=3 this gives 81 segments. Segment selection reuses the FP32
 exponent/mantissa bit-fields directly, matching the RTL implementation.
 
 Polynomial coefficients are generated at HDL build time by
-`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits degree-2 polynomials
-to the reference PyTorch functions and writes `pwpolyf_pkg.sv` — a
-SystemVerilog package with one `func_cfg_t` struct per activation
-(clamping config + coefficient table). K can take any value; it defaults
-to 3 when inferred from standard ONNX ops.
+`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits polynomials of the
+configured degree to the reference PyTorch functions and writes
+`pwpolyf_pkg.sv` — a SystemVerilog package with one `func_cfg_t` struct per
+activation (clamping config + coefficient table). Both K and degree are
+configurable; they default to K=3 and degree=2 when inferred from standard
+ONNX ops.
 
 ## Architecture
 
@@ -74,18 +75,19 @@ Notes:
 ## Folding
 
 PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold.
-Each PE instantiates its own polynomial evaluation pipeline (2 DSPs).
+Each PE instantiates its own polynomial evaluation pipeline (`degree` DSPs).
 `SetFolding` handles PE selection automatically.
 
-| PE | DSPs | Approx LUTs | Cycles (per spatial position) |
-|----|------|-------------|-------------------------------|
-| 1  | 2    | 200         | NumChannels                   |
-| C  | 2C   | 200C        | 1                             |
+| PE | Degree | DSPs       | Approx LUTs      | Cycles (per spatial position) |
+|----|--------|------------|-------------------|-------------------------------|
+| 1  | 2      | 2          | 200               | NumChannels                   |
+| C  | 2      | 2C         | 200C              | 1                             |
+| 1  | 3      | 3          | 300               | NumChannels                   |
 
 ## Resource estimates
 
-- **DSP:** 2 per PE (two FP32 FMA stages)
-- **LUT:** ~200 per PE (segment address decode + control)
+- **DSP:** `degree * PE` (one FP32 FMA stage per polynomial degree, in each PE)
+- **LUT:** approximately `100 * degree * PE` (segment address decode + control)
 - **BRAM/URAM:** 0 (coefficients stored in LUT/registers)
 
 ## ONNX export
@@ -109,7 +111,8 @@ Attributes on the explicit PWPolyF ONNX node:
 | Attribute          | Type   | Description                              |
 |--------------------|--------|------------------------------------------|
 | `func`             | string | Activation function name                 |
-| `K`                | int    | Mantissa subdivision bits                |
+| `K`                | int    | Mantissa subdivision bits (default 3)    |
+| `degree`           | int    | Polynomial degree / FMA stages (default 2) |
 | `NumChannels`      | int    | Number of channels (last input dim)      |
 | `PE`               | int    | Processing elements                      |
 | `inputDataType`    | string | Input data type (FLOAT32)                |
@@ -159,6 +162,10 @@ Attributes on the explicit PWPolyF ONNX node:
 - **SiLU edge cases**: reversed Mul input order, multi-consumer Sigmoid
 - **Execution correctness**: standard ops produce same output as PiecewisePolyActivation
 - **SpecializeLayers**: verifies RTL specialization
-- **Resource estimates**: DSP/LUT/BRAM checks across PE values
+- **Resource estimates**: DSP/LUT/BRAM checks across PE and degree values
 - **Folded shapes**: input/output/stream width calculations
 - **Expected cycles**: cycle count estimation + analysis pass integration
+- **Coefficient package**: `generate_coeffs_pkg()` output validation for K and degree
+- **HDL generation** (Vivado): verifies `generate_hdl` produces correct files and package content
+- **RTL simulation** (Vivado, slow): node-by-node rtlsim with cycle count verification
+- **Stitched IP** (Vivado, slow): end-to-end stitched IP rtlsim
diff --git a/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
index eecf2ac74d..9bbbaa0987 100644
--- a/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
+++ b/finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v
@@ -35,7 +35,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	parameter  PE = $PE$,
 	parameter  FUNC = $FUNC$
 )(
-	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+	(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out0_V, ASSOCIATED_RESET ap_rst_n" *)
 	(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
 	input	ap_clk,
 	(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
@@ -47,9 +47,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 	input [$IN_WIDTH$-1:0]	in0_V_TDATA,
 
 	//- AXI Stream - Output -------------
-	input	out_V_TREADY,
-	output	out_V_TVALID,
-	output [$OUT_WIDTH$-1:0]	out_V_TDATA
+	input	out0_V_TREADY,
+	output	out0_V_TVALID,
+	output [$OUT_WIDTH$-1:0]	out0_V_TDATA
 );
 
 	pwpolyf #(
@@ -61,9 +61,9 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
 		.xdat(in0_V_TDATA),
 		.xvld(in0_V_TVALID),
 		.xrdy(in0_V_TREADY),
-		.ydat(out_V_TDATA),
-		.yvld(out_V_TVALID),
-		.yrdy(out_V_TREADY)
+		.ydat(out0_V_TDATA),
+		.yvld(out0_V_TVALID),
+		.yrdy(out0_V_TREADY)
 	);
 
 endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn_xsi/finn_xsi/adapter.py b/finn_xsi/finn_xsi/adapter.py
index 0b73787a60..a10d7bde9c 100644
--- a/finn_xsi/finn_xsi/adapter.py
+++ b/finn_xsi/finn_xsi/adapter.py
@@ -47,7 +47,7 @@ def compile_sim_obj(top_module_name, source_list, sim_out_dir, debug=False, beha
 
         # sort src list so that packages are loaded first
         # these packages must be compiled before modules that depend on them
-        pkg_patterns = ["swg_pkg", "mvu_pkg"]
+        pkg_patterns = ["swg_pkg", "mvu_pkg", "pwpolyf_pkg"]
         srcs_list = sorted(
             source_list, key=lambda s: (not any(pkg in s for pkg in pkg_patterns), s)
         )
diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
index d412e1669a..48b5f33fd9 100644
--- a/src/finn/custom_op/fpgadataflow/pwpolyf.py
+++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py
@@ -59,6 +59,8 @@ def get_nodeattr_types(self):
             # FINN DataTypes for inputs, outputs (always FLOAT32)
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
+            # polynomial degree (number of FMA stages per PE)
+            "degree": ("i", False, 2),
             # number of input vectors, examples:
             # [1] is a single vector (like a FC layer with batch=1)
             # [4] is four vectors (like a FC layer with batch=4)
@@ -155,7 +157,8 @@ def get_exp_cycles(self):
 
     def lut_estimation(self):
         pe = self.get_nodeattr("PE")
-        return 200 * pe
+        degree = self.get_nodeattr("degree")
+        return 100 * degree * pe
 
     def bram_estimation(self):
         # coefficients stored in LUT ROM, not BRAM
@@ -165,9 +168,9 @@ def uram_estimation(self):
         return 0
 
     def dsp_estimation(self, fpgapart=None):
-        # two DSPFP32 FMA instances per PE (Horner evaluation)
         pe = self.get_nodeattr("PE")
-        return 2 * pe
+        degree = self.get_nodeattr("degree")
+        return degree * pe
 
     def execute_node(self, context, graph):
         node = self.onnx_node
@@ -181,7 +184,8 @@ def execute_node(self, context, graph):
 
         from finn.util.pwpolyf import PiecewisePolyActivation  # noqa: PLC0415
 
-        mod = PiecewisePolyActivation(func, K=K)
+        degree = self.get_nodeattr("degree")
+        mod = PiecewisePolyActivation(func, K=K, degree=degree)
         with torch.no_grad():
             x = torch.from_numpy(inp.astype(np.float32))
             y = mod(x)
diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
index 6bd80dd0df..f9ee038214 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
@@ -40,13 +40,12 @@ def _float_to_hex(f):
     return "%08X" % struct.unpack("!I", struct.pack("!f", float(f)))[0]
 
 
-def generate_coeffs_pkg(K, num_samples=1000):
+def generate_coeffs_pkg(K, degree=2, num_samples=1000):
     """Generate the pwpolyf_pkg.sv package content for a given K value.
 
     Produces a SystemVerilog package with a func_cfg_t struct per activation
     function, containing clamping parameters and polynomial coefficients.
     """
-    degree = 2
     num_subs = 1 << K
     num_segs = 1 + 2 * NUM_OCTAVES * num_subs
 
@@ -72,7 +71,7 @@ def generate_coeffs_pkg(K, num_samples=1000):
 
     for func_name in SUPPORTED_FUNCS:
         cfg = CLAMP_CFG[func_name]
-        coeffs = _fit_coefficients(func_name, K, num_samples)
+        coeffs = _fit_coefficients(func_name, K, degree=degree, num_samples=num_samples)
         label = func_name.upper()
         neg_hex = _float_to_hex(cfg["neg_clamp"])
         pos_hex = _float_to_hex(cfg["pos_clamp"])
@@ -123,8 +122,8 @@ def generate_hdl(self, model, fpgapart, clk):
         self.set_nodeattr("gen_top_module", topname)
 
         code_gen_dict = {
-            "$MODULE_NAME_AXI_WRAPPER$": topname + "_axi_wrapper",
-            "$TOP_MODULE$": topname + "_axi_wrapper",
+            "$MODULE_NAME_AXI_WRAPPER$": topname,
+            "$TOP_MODULE$": topname,
             "$PE$": str(pe),
             "$FUNC$": '"%s"' % func,
             "$IN_WIDTH$": str(pe * 32),
@@ -143,9 +142,10 @@ def generate_hdl(self, model, fpgapart, clk):
         for sv_file in ["pwpolyf.sv", "queue.sv"]:
             shutil.copy(rtllib_dir + sv_file, code_gen_dir)
 
-        # generate package with coefficients matching the node's K value
+        # generate package with coefficients matching the node's K and degree
         K = self.get_nodeattr("K")
-        pkg_data = generate_coeffs_pkg(K)
+        degree = self.get_nodeattr("degree")
+        pkg_data = generate_coeffs_pkg(K, degree=degree)
         with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "w") as f:
             f.write(pkg_data)
 
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index 3f714d7ae7..abc5f68b5b 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -317,7 +317,9 @@ def _match_erf_gelu(self, model, erf_node):
         return (gelu_input, mul_x.output[0], nodes_to_remove)
 
     @staticmethod
-    def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3):
+    def _make_pwpolyf_node(
+        pwp_input, pwp_output, func, in_shape, idt, name, K=3, degree=2
+    ):
         num_channels = in_shape[-1]
         return helper.make_node(
             "PWPolyF",
@@ -327,6 +329,7 @@ def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3):
             backend="fpgadataflow",
             func=func,
             K=K,
+            degree=degree,
             NumChannels=num_channels,
             PE=1,
             inputDataType=idt.name,
diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py
index da3f65e246..9fd82c570c 100644
--- a/src/finn/util/pwpolyf.py
+++ b/src/finn/util/pwpolyf.py
@@ -91,19 +91,19 @@ def _segment_boundaries(K):
     return bounds
 
 
-def _fit_coefficients(func_name, K, num_samples=1000):
-    """Fit degree-2 polynomials per segment.  Returns (NUM_SEGS, 3) tensor."""
+def _fit_coefficients(func_name, K, degree=2, num_samples=1000):
+    """Fit degree-N polynomials per segment.  Returns (NUM_SEGS, degree+1) tensor."""
     ref_fn = REFERENCE_FUNCS[func_name]
     bounds = _segment_boundaries(K)
     num_segs = len(bounds)
-    coeffs = np.zeros((num_segs, 3), dtype=np.float64)
+    coeffs = np.zeros((num_segs, degree + 1), dtype=np.float64)
 
     for seg, (lo, hi) in enumerate(bounds):
         xs = np.linspace(lo, hi, num_samples, dtype=np.float64)
         with torch.no_grad():
             ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64)
-        c = np.polynomial.polynomial.polyfit(xs, ys, deg=2)
-        coeffs[seg] = c[:3]
+        c = np.polynomial.polynomial.polyfit(xs, ys, deg=degree)
+        coeffs[seg] = c[: degree + 1]
 
     return torch.from_numpy(coeffs.astype(np.float32))
 
@@ -146,6 +146,7 @@ class PWPolyFFunction(torch.autograd.Function):
     def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
         num_subs = 1 << K
         num_segs = 1 + 2 * NUM_OCTAVES * num_subs
+        degree = coeffs.shape[1] - 1
         pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
 
         orig_shape = x.shape
@@ -154,11 +155,10 @@ def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
         seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs)
 
         c = coeffs[seg_idx]
-        a0 = c[:, 0]
-        a1 = c[:, 1]
-        a2 = c[:, 2]
-
-        y = a0 + x_flat * (a1 + a2 * x_flat)
+        # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...))
+        y = c[:, degree]
+        for i in range(degree - 1, -1, -1):
+            y = c[:, i] + x_flat * y
 
         if pos_passthrough:
             pos_val = x_flat
@@ -183,18 +183,19 @@ class PiecewisePolyActivation(nn.Module):
     Emits a single PWPolyF custom op node during ONNX export.
     """
 
-    def __init__(self, func="gelu", K=3, fit_samples=1000):
+    def __init__(self, func="gelu", K=3, degree=2, fit_samples=1000):
         super().__init__()
         if func not in SUPPORTED_FUNCS:
             raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS))
 
         self.func = func
         self.K = K
+        self.degree = degree
         self.num_subs = 1 << K
         self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs
         self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
 
-        coeffs = _fit_coefficients(func, K, fit_samples)
+        coeffs = _fit_coefficients(func, K, degree=degree, num_samples=fit_samples)
         self.register_buffer("coeffs", coeffs)
 
         neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32)
@@ -221,12 +222,10 @@ def forward(self, x):
         )
 
         c = self.coeffs[seg_idx]
-        a0 = c[:, 0]
-        a1 = c[:, 1]
-        a2 = c[:, 2]
-
-        # Horner: y = a0 + x*(a1 + a2*x)
-        y = a0 + x_flat * (a1 + a2 * x_flat)
+        # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...))
+        y = c[:, self.degree]
+        for i in range(self.degree - 1, -1, -1):
+            y = c[:, i] + x_flat * y
 
         if self.pos_passthrough:
             pos_val = x_flat
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index a36117b90d..8e333ccd08 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -41,11 +41,19 @@
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
+from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import generate_coeffs_pkg
 from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.util.pwpolyf import PiecewisePolyActivation
 
 test_fpga_part = "xczu3eg-sbva484-1-e"
+target_clk_ns = 5
 
 
 def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs):
@@ -220,17 +228,19 @@ def test_pwpolyf_specialize_rtl(func):
 
 @pytest.mark.parametrize("func", ["gelu", "tanh"])
 @pytest.mark.parametrize("pe", [1, 2, 4])
+@pytest.mark.parametrize("degree", [2, 3])
 @pytest.mark.fpgadataflow
-def test_pwpolyf_resource_estimates(func, pe):
+def test_pwpolyf_resource_estimates(func, pe, degree):
     K = 3
     num_channels = 8
     model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])
     node = model.graph.node[0]
     inst = getCustomOp(node)
     inst.set_nodeattr("PE", pe)
+    inst.set_nodeattr("degree", degree)
 
-    assert inst.dsp_estimation() == 2 * pe
-    assert inst.lut_estimation() == 200 * pe
+    assert inst.dsp_estimation() == degree * pe
+    assert inst.lut_estimation() == 100 * degree * pe
     assert inst.bram_estimation() == 0
     assert inst.uram_estimation() == 0
 
@@ -563,3 +573,174 @@ def test_pwpolyf_erf_gelu_execution():
     with torch.no_grad():
         y_expected = ref_mod(torch.from_numpy(x)).numpy()
     assert np.allclose(y_produced, y_expected, atol=1e-6)
+
+
+# ---------- coefficient package smoketests ----------
+
+
+@pytest.mark.parametrize("K", [2, 3, 4])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_generate_coeffs_pkg(K):
+    """Verify generate_coeffs_pkg produces valid SystemVerilog package."""
+    pkg = generate_coeffs_pkg(K)
+
+    assert "package pwpolyf_pkg" in pkg
+    assert "endpackage" in pkg
+    # localparam lines use padded alignment in the generated SV
+    assert "DEGREE      = 2;" in pkg
+    assert "K           = %d;" % K in pkg
+
+    num_segs = 1 + 2 * 5 * (1 << K)
+    assert "NUM_SEGS    = %d;" % num_segs in pkg
+
+    for func_label in ["GELU", "SILU", "SIGMOID", "TANH"]:
+        assert func_label + " = '{" in pkg
+
+    seg_lines = [line for line in pkg.split("\n") if "// seg" in line]
+    # Each function has num_segs segments, 4 functions total
+    assert len(seg_lines) == 4 * num_segs
+
+
+@pytest.mark.parametrize("degree", [1, 2, 3])
+@pytest.mark.fpgadataflow
+def test_pwpolyf_generate_coeffs_pkg_degree(degree):
+    """Verify generate_coeffs_pkg respects degree parameter."""
+    K = 3
+    pkg = generate_coeffs_pkg(K, degree=degree)
+
+    assert "DEGREE      = %d;" % degree in pkg
+    # Only the lines tagged "// seg 0" are checked: each must carry degree+1 coefficients
+    seg_lines = [line for line in pkg.split("\n") if "// seg 0" in line]
+    for line in seg_lines:
+        hex_vals = [s for s in line.split() if s.startswith("32'h")]
+        assert len(hex_vals) == degree + 1
+
+
+# ---------- generate_hdl smoketests ----------
+
+
+@pytest.mark.parametrize("func", ["gelu", "tanh"])
+@pytest.mark.parametrize("pe", [1, 2])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+def test_pwpolyf_generate_hdl(func, pe):
+    """Verify generate_hdl produces expected RTL files."""
+    num_channels = 4
+    model = make_pwpolyf_modelwrapper(func, 3, num_channels, [1])
+    model = model.transform(SpecializeLayers(test_fpga_part))
+    model = model.transform(GiveUniqueNodeNames())
+
+    node = model.graph.node[0]
+    assert node.op_type == "PWPolyF_rtl"
+    inst = getCustomOp(node)
+    inst.set_nodeattr("PE", pe)
+
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+
+    # Re-fetch node after transform (PrepareIP returns a new model)
+    node = model.graph.node[0]
+    inst = getCustomOp(node)
+
+    code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
+    assert code_gen_dir, "code_gen_dir_ipgen not set after PrepareIP"
+    assert os.path.isfile(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"))
+    assert os.path.isfile(os.path.join(code_gen_dir, "pwpolyf.sv"))
+    assert os.path.isfile(os.path.join(code_gen_dir, "queue.sv"))
+
+    topname = inst.get_nodeattr("gen_top_module")
+    assert os.path.isfile(os.path.join(code_gen_dir, topname + ".v"))
+
+    # Verify package content
+    with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "r") as f:
+        pkg_content = f.read()
+    assert "DEGREE      = 2;" in pkg_content
+    assert "K           = 3;" in pkg_content
+    assert func.upper() + " = '{" in pkg_content
+
+
+# ---------- RTL simulation tests ----------
+
+
+@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
+@pytest.mark.parametrize("num_channels", [4, 8])
+@pytest.mark.parametrize("pe", [1, 2, 4])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_pwpolyf_rtlsim(func, num_channels, pe):
+    """Node-by-node RTL simulation of PWPolyF_rtl."""
+    if num_channels % pe != 0:
+        pytest.skip("PE does not divide NumChannels")
+
+    K = 3
+    model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])
+
+    # Reference output from the generic PWPolyF node (Python execution, not cppsim)
+    x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32)
+    input_dict = {"inp": x}
+    y_ref = oxe.execute_onnx(model, input_dict)["outp"]
+
+    # Specialize to RTL and set PE
+    model = model.transform(SpecializeLayers(test_fpga_part))
+    model = model.transform(GiveUniqueNodeNames())
+    node = model.graph.node[0]
+    assert node.op_type == "PWPolyF_rtl"
+    inst = getCustomOp(node)
+    inst.set_nodeattr("PE", pe)
+
+    # RTL simulation pipeline
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(PrepareRTLSim())
+
+    y_rtl = oxe.execute_onnx(model, input_dict)["outp"]
+    assert np.allclose(y_ref, y_rtl, atol=1e-4), (
+        "RTL output does not match cppsim reference"
+    )
+
+    # Verify cycle count (re-fetch node after transforms)
+    node = model.graph.node[0]
+    inst = getCustomOp(node)
+    cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+    exp_cycles = exp_cycles_dict[node.name]
+    assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
+    assert exp_cycles != 0
+
+
+@pytest.mark.parametrize("func", ["gelu", "sigmoid"])
+@pytest.mark.parametrize("pe", [1, 2])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_pwpolyf_rtlsim_stitched_ip(func, pe):
+    """Stitched IP RTL simulation of PWPolyF_rtl."""
+    K = 3
+    num_channels = 4
+    model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])
+
+    # Reference output from the generic PWPolyF node (Python execution, not cppsim)
+    x = np.random.uniform(-5, 5, (1, num_channels)).astype(np.float32)
+    input_dict = {model.graph.input[0].name: x}
+    y_ref = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name]
+
+    # Specialize to RTL and set PE
+    model = model.transform(SpecializeLayers(test_fpga_part))
+    model = model.transform(GiveUniqueNodeNames())
+    node = model.graph.node[0]
+    inst = getCustomOp(node)
+    inst.set_nodeattr("PE", pe)
+
+    # Stitched IP pipeline
+    model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns))
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model.set_metadata_prop("exec_mode", "rtlsim")
+
+    input_dict = {model.graph.input[0].name: x}
+    y_rtl = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name]
+    assert np.allclose(y_ref, y_rtl, atol=1e-4), (
+        "Stitched IP output does not match cppsim reference"
+    )

From c23097b92d43ed67371c4fd69bd20ee078bbb683 Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Tue, 28 Apr 2026 13:28:12 +0100
Subject: [PATCH 08/12] versal check

---
 docs/finn/pwpolyf.md                          | 26 +++++++++++--------
 src/finn/custom_op/fpgadataflow/pwpolyf.py    | 26 ++++++++++++++++---
 .../fpgadataflow/specialize_layers.py         | 26 +++++++++++++++++++
 .../fpgadataflow/test_fpgadataflow_pwpolyf.py | 19 ++++++++++----
 4 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md
index 11b86e88a4..0fd89e5738 100644
--- a/docs/finn/pwpolyf.md
+++ b/docs/finn/pwpolyf.md
@@ -5,10 +5,10 @@
 PWPolyF is a hardware activation layer that approximates nonlinear functions
 (GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated via Horner's
 method on a chain of DSPFP32 FMA units. With the default degree 2, this uses
-two cascaded DSPs per PE, giving single-cycle-per-element throughput with no
-BRAM usage. Per-function configuration (clamping behaviour and polynomial
-coefficients) is delivered through a SystemVerilog package (`pwpolyf_pkg`)
-using a `func_cfg_t` struct.
+two cascaded DSPs and one RAMB18 coefficient ROM per PE, giving
+single-cycle-per-element throughput. Per-function configuration (clamping
+behaviour and polynomial coefficients) is delivered through a SystemVerilog
+package (`pwpolyf_pkg`) using a `func_cfg_t` struct.
 
 The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero
 region, positive octave sub-segments, and negative mirrors. With the default
@@ -25,7 +25,9 @@ ONNX ops.
 
 ## Architecture
 
-PWPolyF is **RTL-only** (no HLS variant). Two export paths are supported:
+PWPolyF is **RTL-only** (no HLS variant) and targets Versal devices only,
+since the RTL instantiates the Versal DSPFP32 primitive. Two export paths are
+supported:
 
 ```
 Path A: PiecewisePolyActivation        Path B: nn.GELU / nn.SiLU / etc.
@@ -78,17 +80,19 @@ PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold.
 Each PE instantiates its own polynomial evaluation pipeline (`degree` DSPs).
 `SetFolding` handles PE selection automatically.
 
-| PE | Degree | DSPs       | Approx LUTs      | Cycles (per spatial position) |
-|----|--------|------------|-------------------|-------------------------------|
-| 1  | 2      | 2          | 200               | NumChannels                   |
-| C  | 2      | 2C         | 200C              | 1                             |
-| 1  | 3      | 3          | 300               | NumChannels                   |
+| PE | Degree | DSPs       | BRAM18s           | Approx LUTs      | Cycles (per spatial position) |
+|----|--------|------------|-------------------|------------------|-------------------------------|
+| 1  | 2      | 2          | 1                 | 200              | NumChannels                   |
+| C  | 2      | 2C         | C                 | 200C             | 1                             |
+| 1  | 3      | 3          | 2                 | 300              | NumChannels                   |
 
 ## Resource estimates
 
 - **DSP:** `degree * PE` (one FP32 FMA stage per polynomial degree per PE)
 - **LUT:** `~100 * degree * PE` (segment address decode + control)
-- **BRAM/URAM:** 0 (coefficients stored in LUT/registers)
+- **BRAM18:** `(degree - 1) * PE` for default `K=3`, twice that for `K=6` (Vivado infers
+  delayed coefficient lookups as 32-bit ROMs; tables deeper than 512 entries use 1Kx18 RAMB18 pairs)
+- **URAM:** 0
 
 ## ONNX export
 
diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
index 48b5f33fd9..a9143984d5 100644
--- a/src/finn/custom_op/fpgadataflow/pwpolyf.py
+++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py
@@ -26,6 +26,8 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import math
+
 import numpy as np
 from qonnx.core.datatype import DataType
 
@@ -40,7 +42,7 @@ class PWPolyF(HWCustomOp):
     """
     HW op for piecewise polynomial activations (GELU, SiLU, Sigmoid, Tanh).
 
-    Element-wise FP32, coefficients baked into RTL.  No weights or BRAM.
+    Element-wise FP32, coefficients baked into RTL.  No weights.
     """
 
     def __init__(self, onnx_node, **kwargs):
@@ -161,8 +163,26 @@ def lut_estimation(self):
         return 100 * degree * pe
 
     def bram_estimation(self):
-        # coefficients stored in LUT ROM, not BRAM
-        return 0
+        pe = self.get_nodeattr("PE")
+        degree = self.get_nodeattr("degree")
+        num_segs = self.get_num_segments()
+
+        if degree <= 1:
+            return 0
+
+        # Stages after the first use a registered dynamic coefficient lookup
+        # for the DSP C input. Vivado infers this as one 32-bit wide ROM per
+        # stage and PE, backed by RAMB18 for the default K=3 table depth.
+        coeff_width = 32
+        if coeff_width <= 18 or num_segs > 512:
+            bram18_per_coeff_rom = math.ceil(num_segs / 1024) * math.ceil(
+                coeff_width / 18
+            )
+        else:
+            bram18_per_coeff_rom = math.ceil(num_segs / 512) * math.ceil(
+                coeff_width / 36
+            )
+        return pe * (degree - 1) * bram18_per_coeff_rom
 
     def uram_estimation(self):
         return 0
diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py
index dcd2472e0a..5c0dd3a0cb 100644
--- a/src/finn/transformation/fpgadataflow/specialize_layers.py
+++ b/src/finn/transformation/fpgadataflow/specialize_layers.py
@@ -82,6 +82,11 @@ def _determine_impl_style(node, fpgapart, model):
                     return "rtl"
                 else:
                     return "hls"
+            elif optype == "PWPolyF":
+                if _pwpolyf_rtl_possible(node, fpgapart):
+                    return "rtl"
+                else:
+                    _raise_pwpolyf_unsupported(node, fpgapart)
             elif optype == "Requant":
                 if _requant_rtl_possible(node, fpgapart):
                     return "rtl"
@@ -106,6 +111,8 @@ def _determine_impl_style(node, fpgapart, model):
         if hls_variant:
             return "hls"
         elif rtl_variant:
+            if optype == "PWPolyF" and not _pwpolyf_rtl_possible(node, fpgapart):
+                _raise_pwpolyf_unsupported(node, fpgapart)
             warn_str = """There is no HLS variant of %s. Node %s will automatically be
                         set to RTL variant.""" % (
                 node.op_type,
@@ -158,6 +165,11 @@ def _determine_impl_style(node, fpgapart, model):
                 warnings.warn(warn_str)
                 return "hls"
 
+        elif optype == "PWPolyF":
+            if _pwpolyf_rtl_possible(node, fpgapart):
+                return "rtl"
+            else:
+                _raise_pwpolyf_unsupported(node, fpgapart)
         elif optype == "LayerNorm":
             if _layernorm_rtl_possible(node, fpgapart):
                 return "rtl"
@@ -346,6 +358,20 @@ def _layernorm_rtl_possible(n, fpgapart):
         return True
 
 
+def _pwpolyf_rtl_possible(n, fpgapart):
+    # PWPolyF uses the Versal DSPFP32 primitive.
+    return is_versal(fpgapart)
+
+
+def _raise_pwpolyf_unsupported(n, fpgapart):
+    raise Exception(
+        """PWPolyF node %s cannot be specialized for FPGA part %s.
+        PWPolyF_rtl uses the Versal DSPFP32 primitive and is only supported
+        on Versal devices."""
+        % (n.name, fpgapart)
+    )
+
+
 def _requant_rtl_possible(n, fpgapart):
     # Checks whether RTL-based Requant is supported
     # RTL Requant requires:
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index 8e333ccd08..2b4f1e119e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -52,7 +52,8 @@
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.util.pwpolyf import PiecewisePolyActivation
 
-test_fpga_part = "xczu3eg-sbva484-1-e"
+test_fpga_part = "xcve2002-sbva484-2MP-e-S"
+non_versal_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5
 
 
@@ -226,12 +227,20 @@ def test_pwpolyf_specialize_rtl(func):
     assert inst.get_nodeattr("K") == K
 
 
+@pytest.mark.fpgadataflow
+def test_pwpolyf_specialize_rejects_non_versal():
+    model = make_pwpolyf_modelwrapper("gelu", 3, 8, [1])
+
+    with pytest.raises(Exception, match="Versal"):
+        model.transform(SpecializeLayers(non_versal_fpga_part))
+
+
 @pytest.mark.parametrize("func", ["gelu", "tanh"])
 @pytest.mark.parametrize("pe", [1, 2, 4])
-@pytest.mark.parametrize("degree", [2, 3])
+@pytest.mark.parametrize("degree", [1, 2, 3])
+@pytest.mark.parametrize("K, bram18_per_coeff_rom", [(3, 1), (6, 2)])
 @pytest.mark.fpgadataflow
-def test_pwpolyf_resource_estimates(func, pe, degree):
-    K = 3
+def test_pwpolyf_resource_estimates(func, pe, degree, K, bram18_per_coeff_rom):
     num_channels = 8
     model = make_pwpolyf_modelwrapper(func, K, num_channels, [1])
     node = model.graph.node[0]
@@ -241,7 +250,7 @@ def test_pwpolyf_resource_estimates(func, pe, degree):
 
     assert inst.dsp_estimation() == degree * pe
     assert inst.lut_estimation() == 100 * degree * pe
-    assert inst.bram_estimation() == 0
+    assert inst.bram_estimation() == max(degree - 1, 0) * pe * bram18_per_coeff_rom
     assert inst.uram_estimation() == 0
 
 

From 7d56e906a7eb1859df779133543a5f0a162e6f5f Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Wed, 29 Apr 2026 09:59:16 +0100
Subject: [PATCH 09/12] linting

---
 src/finn/custom_op/fpgadataflow/pwpolyf.py             |  9 ++-------
 .../fpgadataflow/convert_to_hw_layers.py               |  4 +---
 tests/fpgadataflow/test_fpgadataflow_pwpolyf.py        | 10 ++++------
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
index a9143984d5..206fd3690c 100644
--- a/src/finn/custom_op/fpgadataflow/pwpolyf.py
+++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py
@@ -27,7 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import math
-
 import numpy as np
 from qonnx.core.datatype import DataType
 
@@ -175,13 +174,9 @@ def bram_estimation(self):
         # stage and PE, backed by RAMB18 for the default K=3 table depth.
         coeff_width = 32
         if coeff_width <= 18 or num_segs > 512:
-            bram18_per_coeff_rom = math.ceil(num_segs / 1024) * math.ceil(
-                coeff_width / 18
-            )
+            bram18_per_coeff_rom = math.ceil(num_segs / 1024) * math.ceil(coeff_width / 18)
         else:
-            bram18_per_coeff_rom = math.ceil(num_segs / 512) * math.ceil(
-                coeff_width / 36
-            )
+            bram18_per_coeff_rom = math.ceil(num_segs / 512) * math.ceil(coeff_width / 36)
         return pe * (degree - 1) * bram18_per_coeff_rom
 
     def uram_estimation(self):
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index abc5f68b5b..dc09b3daeb 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -317,9 +317,7 @@ def _match_erf_gelu(self, model, erf_node):
         return (gelu_input, mul_x.output[0], nodes_to_remove)
 
     @staticmethod
-    def _make_pwpolyf_node(
-        pwp_input, pwp_output, func, in_shape, idt, name, K=3, degree=2
-    ):
+    def _make_pwpolyf_node(pwp_input, pwp_output, func, in_shape, idt, name, K=3, degree=2):
         num_channels = in_shape[-1]
         return helper.make_node(
             "PWPolyF",
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index 2b4f1e119e..4a1b656631 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -704,9 +704,7 @@ def test_pwpolyf_rtlsim(func, num_channels, pe):
     model = model.transform(PrepareRTLSim())
 
     y_rtl = oxe.execute_onnx(model, input_dict)["outp"]
-    assert np.allclose(y_ref, y_rtl, atol=1e-4), (
-        "RTL output does not match cppsim reference"
-    )
+    assert np.allclose(y_ref, y_rtl, atol=1e-4), "RTL output does not match cppsim reference"
 
     # Verify cycle count (re-fetch node after transforms)
     node = model.graph.node[0]
@@ -750,6 +748,6 @@ def test_pwpolyf_rtlsim_stitched_ip(func, pe):
 
     input_dict = {model.graph.input[0].name: x}
     y_rtl = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name]
-    assert np.allclose(y_ref, y_rtl, atol=1e-4), (
-        "Stitched IP output does not match cppsim reference"
-    )
+    assert np.allclose(
+        y_ref, y_rtl, atol=1e-4
+    ), "Stitched IP output does not match cppsim reference"

From 1b6692d5c75d0febbc0205f40037462b99a695cc Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Mon, 18 May 2026 15:29:16 +0100
Subject: [PATCH 10/12] move pwpolyf torch module

---
 docs/finn/pwpolyf.md                          |   3 +-
 docs/finn/source_code/finn.util.rst           |   9 +
 src/finn/custom_op/fpgadataflow/pwpolyf.py    |   2 +-
 .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py |   7 +-
 src/finn/util/pwpolyf.py                      | 239 +++---------------
 src/finn/util/torch_hw_modules.py             | 236 +++++++++++++++++
 .../fpgadataflow/test_fpgadataflow_pwpolyf.py |   2 +-
 7 files changed, 288 insertions(+), 210 deletions(-)
 create mode 100644 src/finn/util/torch_hw_modules.py

diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md
index 0fd89e5738..cf3aaeeeb5 100644
--- a/docs/finn/pwpolyf.md
+++ b/docs/finn/pwpolyf.md
@@ -138,9 +138,10 @@ Attributes on the explicit PWPolyF ONNX node:
 
 | File | Purpose |
 |------|---------|
+| `util/torch_hw_modules.py` | PyTorch activation module, ONNX export, software simulation |
 | `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) |
 | `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, package generation, rtlsim, IPI) |
-| `util/pwpolyf.py` | PyTorch activation module, ONNX export, software simulation |
+| `util/pwpolyf.py` | Compatibility imports for existing PWPolyF utility users |
 | `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation |
 | `builder/build_dataflow_steps.py` | Build pipeline integration |
 | `transformation/fpgadataflow/set_folding.py` | Folding support (pe_ops list) |
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index a06d55d81e..5ceebc2436 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -188,6 +188,15 @@ finn.util.pytorch
  :show-inheritance:
 
 
+finn.util.torch_hw_modules
+---------------------------
+
+.. automodule:: finn.util.torch_hw_modules
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+
 finn.util.pwpolyf
 -------------------
 
diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
index 206fd3690c..746ebdeb38 100644
--- a/src/finn/custom_op/fpgadataflow/pwpolyf.py
+++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py
@@ -197,7 +197,7 @@ def execute_node(self, context, graph):
         # lazy import to avoid hard dependency on torch at module level
         import torch  # noqa: PLC0415
 
-        from finn.util.pwpolyf import PiecewisePolyActivation  # noqa: PLC0415
+        from finn.util.torch_hw_modules import PiecewisePolyActivation  # noqa: PLC0415
 
         degree = self.get_nodeattr("degree")
         mod = PiecewisePolyActivation(func, K=K, degree=degree)
diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
index f9ee038214..3411c81a8b 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
@@ -32,7 +32,12 @@
 
 from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
 from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
-from finn.util.pwpolyf import CLAMP_CFG, NUM_OCTAVES, SUPPORTED_FUNCS, _fit_coefficients
+from finn.util.torch_hw_modules import (
+    CLAMP_CFG,
+    NUM_OCTAVES,
+    SUPPORTED_FUNCS,
+    _fit_coefficients,
+)
 
 
 def _float_to_hex(f):
diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py
index 9fd82c570c..1972b0248a 100644
--- a/src/finn/util/pwpolyf.py
+++ b/src/finn/util/pwpolyf.py
@@ -26,212 +26,39 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-"""
-Piecewise polynomial activation - PyTorch module and software model.
+"""Compatibility imports for PWPolyF PyTorch utilities.
 
-Drop-in activation that approximates GELU, SiLU, Sigmoid, and Tanh using
-degree-2 polynomials, matching the pwpolyf RTL behaviour.  Emits a single
-PWPolyF custom op node during ONNX export (requires dynamo=False).
+The canonical home for PyTorch modules that match FINN hardware behavior is
+``finn.util.torch_hw_modules``. This module is kept to avoid breaking existing
+imports while downstream code moves to the new location.
 """
 
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-# Constants matching the SystemVerilog module
-NUM_OCTAVES = 5
-EXP_BIAS = 127
-EXP_BASE = 125
-EXP_CLAMP = 130
-
-SUPPORTED_FUNCS = ("gelu", "silu", "sigmoid", "tanh")
-
-REFERENCE_FUNCS = {
-    "gelu": lambda x: F.gelu(x),
-    "silu": lambda x: F.silu(x),
-    "sigmoid": lambda x: torch.sigmoid(x),
-    "tanh": lambda x: torch.tanh(x),
-}
-
-CLAMP_CFG = {
-    "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
-    "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
-    "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False},
-    "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False},
-}
-
-
-def _segment_boundaries(K):
-    """Return (lo, hi) bounds for every segment."""
-    num_subs = 1 << K
-    bounds = []
-
-    # Segment 0: near-zero
-    bounds.append((-0.25, 0.25))
-
-    # Positive segments
-    for octave in range(NUM_OCTAVES):
-        exp_val = EXP_BASE + octave - EXP_BIAS
-        base = 2.0**exp_val
-        for sub in range(num_subs):
-            lo = base * (1.0 + sub / num_subs)
-            hi = base * (1.0 + (sub + 1) / num_subs)
-            bounds.append((lo, hi))
-
-    # Negative segments (mirror of positive)
-    for octave in range(NUM_OCTAVES):
-        exp_val = EXP_BASE + octave - EXP_BIAS
-        base = 2.0**exp_val
-        for sub in range(num_subs):
-            lo = base * (1.0 + sub / num_subs)
-            hi = base * (1.0 + (sub + 1) / num_subs)
-            bounds.append((-hi, -lo))
-
-    return bounds
-
-
-def _fit_coefficients(func_name, K, degree=2, num_samples=1000):
-    """Fit degree-N polynomials per segment.  Returns (NUM_SEGS, degree+1) tensor."""
-    ref_fn = REFERENCE_FUNCS[func_name]
-    bounds = _segment_boundaries(K)
-    num_segs = len(bounds)
-    coeffs = np.zeros((num_segs, degree + 1), dtype=np.float64)
-
-    for seg, (lo, hi) in enumerate(bounds):
-        xs = np.linspace(lo, hi, num_samples, dtype=np.float64)
-        with torch.no_grad():
-            ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64)
-        c = np.polynomial.polynomial.polyfit(xs, ys, deg=degree)
-        coeffs[seg] = c[: degree + 1]
-
-    return torch.from_numpy(coeffs.astype(np.float32))
-
-
-def _segment_index(x, K, num_subs, num_segs):
-    """Map each element to its polynomial segment, mirroring SV addressing."""
-    abs_x = x.abs()
-    is_neg = x < 0
-
-    is_near_zero = abs_x < 0.25
-    is_clamp = abs_x >= 8.0
-    is_neg_clamp = is_neg & is_clamp
-    is_pos_clamp = (~is_neg) & is_clamp
-
-    safe_abs = abs_x.clamp(min=0.25)
-    floor_log2 = torch.floor(torch.log2(safe_abs))
-    octave = (floor_log2 + 2).long().clamp(0, NUM_OCTAVES - 1)
-
-    pow2 = torch.exp2(floor_log2)
-    frac = safe_abs / pow2 - 1.0
-    sub = (frac * num_subs).long().clamp(0, num_subs - 1)
-
-    pos_idx = 1 + octave * num_subs + sub
-    neg_idx = 1 + NUM_OCTAVES * num_subs + octave * num_subs + sub
-
-    seg_idx = torch.where(
-        is_near_zero,
-        torch.zeros_like(pos_idx),
-        torch.where(is_neg, neg_idx, pos_idx),
-    )
-    seg_idx = seg_idx.clamp(0, num_segs - 1)
-
-    return seg_idx, is_neg_clamp, is_pos_clamp
-
-
-class PWPolyFFunction(torch.autograd.Function):
-    """Emits a single PWPolyF ONNX node during export."""
-
-    @staticmethod
-    def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
-        num_subs = 1 << K
-        num_segs = 1 + 2 * NUM_OCTAVES * num_subs
-        degree = coeffs.shape[1] - 1
-        pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
-
-        orig_shape = x.shape
-        x_flat = x.contiguous().view(-1)
-
-        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs)
-
-        c = coeffs[seg_idx]
-        # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...))
-        y = c[:, degree]
-        for i in range(degree - 1, -1, -1):
-            y = c[:, i] + x_flat * y
-
-        if pos_passthrough:
-            pos_val = x_flat
-        else:
-            pos_val = pos_clamp_val.expand_as(y)
-        y = torch.where(is_pos_clamp, pos_val, y)
-        y = torch.where(is_neg_clamp, neg_clamp_val.expand_as(y), y)
-
-        return y.view(orig_shape)
-
-    @staticmethod
-    def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
-        return g.op("PWPolyF", x, func_s=func, K_i=K)
-
-
-class PiecewisePolyActivation(nn.Module):
-    """
-    Drop-in activation matching the pwpolyf hardware behaviour.
-
-    Approximates nonlinear activations using degree-2 polynomials over
-    segments defined by FP32 bit-extraction.  Evaluated via Horner's method.
-    Emits a single PWPolyF custom op node during ONNX export.
-    """
-
-    def __init__(self, func="gelu", K=3, degree=2, fit_samples=1000):
-        super().__init__()
-        if func not in SUPPORTED_FUNCS:
-            raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS))
-
-        self.func = func
-        self.K = K
-        self.degree = degree
-        self.num_subs = 1 << K
-        self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs
-        self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
-
-        coeffs = _fit_coefficients(func, K, degree=degree, num_samples=fit_samples)
-        self.register_buffer("coeffs", coeffs)
-
-        neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32)
-        pos_cv = torch.tensor(CLAMP_CFG[func]["pos_clamp"], dtype=torch.float32)
-        self.register_buffer("neg_clamp_val", neg_cv)
-        self.register_buffer("pos_clamp_val", pos_cv)
-
-    def forward(self, x):
-        if torch.onnx.is_in_onnx_export():
-            return PWPolyFFunction.apply(
-                x,
-                self.coeffs,
-                self.neg_clamp_val,
-                self.pos_clamp_val,
-                self.func,
-                self.K,
-            )
-
-        orig_shape = x.shape
-        x_flat = x.contiguous().view(-1)
-
-        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(
-            x_flat, self.K, self.num_subs, self.num_segs
-        )
-
-        c = self.coeffs[seg_idx]
-        # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...))
-        y = c[:, self.degree]
-        for i in range(self.degree - 1, -1, -1):
-            y = c[:, i] + x_flat * y
-
-        if self.pos_passthrough:
-            pos_val = x_flat
-        else:
-            pos_val = self.pos_clamp_val.expand_as(y)
-        y = torch.where(is_pos_clamp, pos_val, y)
-        y = torch.where(is_neg_clamp, self.neg_clamp_val.expand_as(y), y)
-
-        return y.view(orig_shape)
+from finn.util.torch_hw_modules import (
+    CLAMP_CFG,
+    EXP_BASE,
+    EXP_BIAS,
+    EXP_CLAMP,
+    NUM_OCTAVES,
+    REFERENCE_FUNCS,
+    SUPPORTED_FUNCS,
+    PiecewisePolyActivation,
+    PWPolyFFunction,
+    _fit_coefficients,
+    _segment_boundaries,
+    _segment_index,
+)
+
+__all__ = [
+    "CLAMP_CFG",
+    "EXP_BIAS",
+    "EXP_BASE",
+    "EXP_CLAMP",
+    "NUM_OCTAVES",
+    "PWPolyFFunction",
+    "PiecewisePolyActivation",
+    "REFERENCE_FUNCS",
+    "SUPPORTED_FUNCS",
+    "_fit_coefficients",
+    "_segment_boundaries",
+    "_segment_index",
+]
diff --git a/src/finn/util/torch_hw_modules.py b/src/finn/util/torch_hw_modules.py
new file mode 100644
index 0000000000..3fc560c182
--- /dev/null
+++ b/src/finn/util/torch_hw_modules.py
@@ -0,0 +1,236 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""
+PyTorch modules that match FINN hardware-layer behavior.
+
+These modules are intended as drop-in PyTorch layers for modelling the
+functional behavior of FINN hardware layers before conversion to HWCustomOps.
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Constants matching the SystemVerilog pwpolyf module
+NUM_OCTAVES = 5
+EXP_BIAS = 127
+EXP_BASE = 125
+EXP_CLAMP = 130
+
+SUPPORTED_FUNCS = ("gelu", "silu", "sigmoid", "tanh")
+
+REFERENCE_FUNCS = {
+    "gelu": lambda x: F.gelu(x),
+    "silu": lambda x: F.silu(x),
+    "sigmoid": lambda x: torch.sigmoid(x),
+    "tanh": lambda x: torch.tanh(x),
+}
+
+CLAMP_CFG = {
+    "gelu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
+    "silu": {"neg_clamp": 0.0, "pos_clamp": 0.0, "pos_passthrough": True},
+    "sigmoid": {"neg_clamp": 0.0, "pos_clamp": 1.0, "pos_passthrough": False},
+    "tanh": {"neg_clamp": -1.0, "pos_clamp": 1.0, "pos_passthrough": False},
+}
+
+
+def _segment_boundaries(K):
+    """Return (lo, hi) bounds for every PWPolyF segment."""
+    num_subs = 1 << K
+    bounds = []
+
+    # Segment 0: near-zero
+    bounds.append((-0.25, 0.25))
+
+    # Positive segments
+    for octave in range(NUM_OCTAVES):
+        exp_val = EXP_BASE + octave - EXP_BIAS
+        base = 2.0**exp_val
+        for sub in range(num_subs):
+            lo = base * (1.0 + sub / num_subs)
+            hi = base * (1.0 + (sub + 1) / num_subs)
+            bounds.append((lo, hi))
+
+    # Negative segments (mirror of positive)
+    for octave in range(NUM_OCTAVES):
+        exp_val = EXP_BASE + octave - EXP_BIAS
+        base = 2.0**exp_val
+        for sub in range(num_subs):
+            lo = base * (1.0 + sub / num_subs)
+            hi = base * (1.0 + (sub + 1) / num_subs)
+            bounds.append((-hi, -lo))
+
+    return bounds
+
+
+def _fit_coefficients(func_name, K, degree=2, num_samples=1000):
+    """Fit degree-N polynomials per segment. Returns a (segments, degree+1) tensor."""
+    ref_fn = REFERENCE_FUNCS[func_name]
+    bounds = _segment_boundaries(K)
+    num_segs = len(bounds)
+    coeffs = np.zeros((num_segs, degree + 1), dtype=np.float64)
+
+    for seg, (lo, hi) in enumerate(bounds):
+        xs = np.linspace(lo, hi, num_samples, dtype=np.float64)
+        with torch.no_grad():
+            ys = ref_fn(torch.from_numpy(xs).float()).numpy().astype(np.float64)
+        c = np.polynomial.polynomial.polyfit(xs, ys, deg=degree)
+        coeffs[seg] = c[: degree + 1]
+
+    return torch.from_numpy(coeffs.astype(np.float32))
+
+
+def _segment_index(x, K, num_subs, num_segs):
+    """Map each element to its polynomial segment, mirroring SV addressing."""
+    abs_x = x.abs()
+    is_neg = x < 0
+
+    is_near_zero = abs_x < 0.25
+    is_clamp = abs_x >= 8.0
+    is_neg_clamp = is_neg & is_clamp
+    is_pos_clamp = (~is_neg) & is_clamp
+
+    safe_abs = abs_x.clamp(min=0.25)
+    floor_log2 = torch.floor(torch.log2(safe_abs))
+    octave = (floor_log2 + 2).long().clamp(0, NUM_OCTAVES - 1)
+
+    pow2 = torch.exp2(floor_log2)
+    frac = safe_abs / pow2 - 1.0
+    sub = (frac * num_subs).long().clamp(0, num_subs - 1)
+
+    pos_idx = 1 + octave * num_subs + sub
+    neg_idx = 1 + NUM_OCTAVES * num_subs + octave * num_subs + sub
+
+    seg_idx = torch.where(
+        is_near_zero,
+        torch.zeros_like(pos_idx),
+        torch.where(is_neg, neg_idx, pos_idx),
+    )
+    seg_idx = seg_idx.clamp(0, num_segs - 1)
+
+    return seg_idx, is_neg_clamp, is_pos_clamp
+
+
+class PWPolyFFunction(torch.autograd.Function):
+    """Emit a single PWPolyF ONNX node during legacy torch.onnx export."""
+
+    @staticmethod
+    def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
+        num_subs = 1 << K
+        num_segs = 1 + 2 * NUM_OCTAVES * num_subs
+        degree = coeffs.shape[1] - 1
+        pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
+
+        orig_shape = x.shape
+        x_flat = x.contiguous().view(-1)
+
+        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(x_flat, K, num_subs, num_segs)
+
+        c = coeffs[seg_idx]
+        # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...))
+        y = c[:, degree]
+        for i in range(degree - 1, -1, -1):
+            y = c[:, i] + x_flat * y
+
+        if pos_passthrough:
+            pos_val = x_flat
+        else:
+            pos_val = pos_clamp_val.expand_as(y)
+        y = torch.where(is_pos_clamp, pos_val, y)
+        y = torch.where(is_neg_clamp, neg_clamp_val.expand_as(y), y)
+
+        return y.view(orig_shape)
+
+    @staticmethod
+    def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
+        return g.op("PWPolyF", x, func_s=func, K_i=K)
+
+
+class PiecewisePolyActivation(nn.Module):
+    """
+    Drop-in activation matching FINN's PWPolyF RTL behavior.
+
+    Approximates nonlinear activations using piecewise polynomials over
+    segments defined by FP32 bit extraction. The polynomial is evaluated via
+    Horner's method to match the DSPFP32 FMA chain used by the RTL.
+    """
+
+    def __init__(self, func="gelu", K=3, degree=2, fit_samples=1000):
+        super().__init__()
+        if func not in SUPPORTED_FUNCS:
+            raise ValueError("Unsupported func=%r; choose from %s" % (func, SUPPORTED_FUNCS))
+
+        self.func = func
+        self.K = K
+        self.degree = degree
+        self.num_subs = 1 << K
+        self.num_segs = 1 + 2 * NUM_OCTAVES * self.num_subs
+        self.pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
+
+        coeffs = _fit_coefficients(func, K, degree=degree, num_samples=fit_samples)
+        self.register_buffer("coeffs", coeffs)
+
+        neg_cv = torch.tensor(CLAMP_CFG[func]["neg_clamp"], dtype=torch.float32)
+        pos_cv = torch.tensor(CLAMP_CFG[func]["pos_clamp"], dtype=torch.float32)
+        self.register_buffer("neg_clamp_val", neg_cv)
+        self.register_buffer("pos_clamp_val", pos_cv)
+
+    def forward(self, x):
+        if torch.onnx.is_in_onnx_export():
+            return PWPolyFFunction.apply(
+                x,
+                self.coeffs,
+                self.neg_clamp_val,
+                self.pos_clamp_val,
+                self.func,
+                self.K,
+            )
+
+        orig_shape = x.shape
+        x_flat = x.contiguous().view(-1)
+
+        seg_idx, is_neg_clamp, is_pos_clamp = _segment_index(
+            x_flat, self.K, self.num_subs, self.num_segs
+        )
+
+        c = self.coeffs[seg_idx]
+        # Horner evaluation: y = c0 + x*(c1 + x*(c2 + ...))
+        y = c[:, self.degree]
+        for i in range(self.degree - 1, -1, -1):
+            y = c[:, i] + x_flat * y
+
+        if self.pos_passthrough:
+            pos_val = x_flat
+        else:
+            pos_val = self.pos_clamp_val.expand_as(y)
+        y = torch.where(is_pos_clamp, pos_val, y)
+        y = torch.where(is_neg_clamp, self.neg_clamp_val.expand_as(y), y)
+
+        return y.view(orig_shape)
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index 4a1b656631..f5d03ca82d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -50,7 +50,7 @@
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
-from finn.util.pwpolyf import PiecewisePolyActivation
+from finn.util.torch_hw_modules import PiecewisePolyActivation
 
 test_fpga_part = "xcve2002-sbva484-2MP-e-S"
 non_versal_fpga_part = "xczu3eg-sbva484-1-e"

From 59fc39870eed6afdf2b930d013f45e19bfbc23ab Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Mon, 18 May 2026 16:34:17 +0100
Subject: [PATCH 11/12] Address PWPolyF reviewer comments

---
 docs/finn/components/index.rst                |   1 +
 docs/finn/components/pwpolyf.rst              | 272 ++++++++++++++++++
 docs/finn/pwpolyf.md                          | 176 ------------
 src/finn/custom_op/fpgadataflow/pwpolyf.py    |  49 +---
 .../custom_op/fpgadataflow/rtl/pwpolyf_rtl.py |  48 +---
 src/finn/util/pwpolyf.py                      |  29 +-
 src/finn/util/torch_hw_modules.py             |  29 +-
 .../fpgadataflow/test_fpgadataflow_pwpolyf.py |  48 +---
 8 files changed, 320 insertions(+), 332 deletions(-)
 create mode 100644 docs/finn/components/pwpolyf.rst
 delete mode 100644 docs/finn/pwpolyf.md

diff --git a/docs/finn/components/index.rst b/docs/finn/components/index.rst
index 9ab59297b1..7c8cdf1840 100644
--- a/docs/finn/components/index.rst
+++ b/docs/finn/components/index.rst
@@ -10,3 +10,4 @@ This section provides detailed documentation for specific FINN hardware componen
    :maxdepth: 2
 
    rtl-swg
+   pwpolyf
diff --git a/docs/finn/components/pwpolyf.rst b/docs/finn/components/pwpolyf.rst
new file mode 100644
index 0000000000..0259f35450
--- /dev/null
+++ b/docs/finn/components/pwpolyf.rst
@@ -0,0 +1,272 @@
+PWPolyF Piecewise Polynomial Activation
+=======================================
+
+Overview
+--------
+
+PWPolyF is a hardware activation layer that approximates nonlinear functions
+(GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated with
+Horner's method on a chain of DSPFP32 FMA units. With the default degree of 2,
+this uses two cascaded DSPs and one RAMB18 coefficient ROM per PE, giving
+single-cycle-per-element throughput. Per-function configuration, including
+clamping behaviour and polynomial coefficients, is delivered through a
+SystemVerilog package (``pwpolyf_pkg``) using a ``func_cfg_t`` struct.
+
+The input domain is partitioned into ``1 + 2*5*(2^K)`` segments: one near-zero
+region, ``2^K`` sub-segments in each of 5 positive octaves, and their negative
+mirrors. With the default ``K=3`` this gives 81 segments. Segment selection
+reuses the FP32 exponent and mantissa bit fields directly, matching the RTL.
+
+Polynomial coefficients are generated at HDL build time by
+``PWPolyF_rtl._generate_coeffs_pkg()``, which fits polynomials of the
+configured degree to the reference PyTorch functions and writes
+``pwpolyf_pkg.sv``. Both ``K`` and ``degree`` are configurable. They default to
+``K=3`` and ``degree=2`` when inferred from standard ONNX ops.
+
+Architecture
+------------
+
+PWPolyF is RTL-only, with no HLS variant, and targets Versal devices only. The
+RTL instantiates the Versal DSPFP32 primitive, so UltraScale+ and older parts
+must not be specialized to this backend.
+
+Two export paths are supported:
+
+.. code-block:: text
+
+   Path A: PiecewisePolyActivation        Path B: nn.GELU / nn.SiLU / etc.
+       |  torch.onnx.export                   |  torch.onnx.export
+       |  (dynamo=False)                      |  (dynamo=True or False)
+       v                                      v
+   PWPolyF custom ONNX node           Standard ONNX ops (Gelu, Sigmoid,
+       |                               Tanh, Sigmoid+Mul for SiLU,
+       |                               Div+Erf+Add+Mul+Mul for GELU)
+       |                                      |
+       +------------- both paths -------------+
+                         |
+                   InferPWPolyFLayer
+                         v
+               PWPolyF HW op (finn.custom_op.fpgadataflow)
+                         |  SpecializeLayers
+                         v
+               PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl)
+                         |  generate_hdl
+                         v
+               finn-rtllib/pwpolyf/hdl/ SystemVerilog IP
+
+Standard ONNX Op Inference
+--------------------------
+
+``InferPWPolyFLayer`` recognises standard ONNX activation ops in addition to
+the explicit ``PWPolyF`` custom op. This allows models that use ``nn.GELU``,
+``nn.SiLU``, ``nn.Sigmoid``, or ``nn.Tanh`` to be exported with ``dynamo=True``
+or ``dynamo=False`` and automatically converted to PWPolyF HW layers.
+
+.. list-table::
+   :header-rows: 1
+   :widths: 20 45 20
+
+   * - ONNX op type
+     - Pattern
+     - Maps to
+   * - ``Gelu`` (opset 20+)
+     - Single node
+     - ``func="gelu"``
+   * - ``Div`` + ``Erf`` + ``Add`` + ``Mul`` + ``Mul``
+     - ``x * 0.5 * (1 + erf(x / sqrt(2)))``
+     - ``func="gelu"``
+   * - ``Sigmoid``
+     - Single node (standalone)
+     - ``func="sigmoid"``
+   * - ``Tanh``
+     - Single node
+     - ``func="tanh"``
+   * - ``Sigmoid`` + ``Mul``
+     - ``Mul(x, Sigmoid(x))``
+     - ``func="silu"``
+
+``Gelu`` as a single ONNX node requires opset 20 or later. With lower opsets,
+including the opset 18 default of ``dynamo=True`` export, GELU decomposes into
+a 5-node Erf-based pattern. Both forms are matched. SiLU has no standard ONNX
+op and decomposes to ``Sigmoid(x) * x``. Only FLOAT32 inputs are converted.
+
+Folding
+-------
+
+PWPolyF uses PE parallelism. ``NumChannels % PE == 0`` must hold. Each PE
+instantiates its own polynomial evaluation pipeline with ``degree`` DSPs.
+``SetFolding`` handles PE selection automatically.
+
+.. list-table::
+   :header-rows: 1
+   :widths: 10 10 15 15 15 25
+
+   * - PE
+     - Degree
+     - DSPs
+     - BRAM18s
+     - Approx LUTs
+     - Cycles per spatial position
+   * - 1
+     - 2
+     - 2
+     - 1
+     - 200
+     - NumChannels
+   * - C
+     - 2
+     - 2C
+     - C
+     - 200C
+     - 1
+   * - 1
+     - 3
+     - 3
+     - 2
+     - 300
+     - NumChannels
+
+Resource Estimates
+------------------
+
+* DSP: ``degree * PE`` (one FP32 FMA stage per polynomial degree per PE)
+* LUT: approximately ``100 * degree * PE`` for segment address decode and
+  control
+* BRAM18: ``(degree - 1) * PE`` for default ``K=3``. Vivado infers delayed
+  coefficient lookups as 32-bit ROMs.
+* URAM: 0
+
+ONNX Export
+-----------
+
+Two export paths are supported:
+
+* ``PiecewisePolyActivation`` exports as a single ``PWPolyF`` custom op via
+  ``torch.autograd.Function.symbolic()``. It requires ``dynamo=False`` and
+  preserves the ``K`` attribute on the ONNX node.
+* Standard PyTorch modules (``nn.GELU``, ``nn.SiLU``, ``nn.Sigmoid``,
+  ``nn.Tanh``) export with ``dynamo=True`` or ``dynamo=False`` and produce
+  standard ONNX ops that ``InferPWPolyFLayer`` converts to PWPolyF with
+  default ``K=3``.
+
+Attributes on the explicit PWPolyF ONNX node are:
+
+* ``func``: one of ``gelu``, ``silu``, ``sigmoid``, ``tanh``
+* ``K``: mantissa subdivision bits, default 3
+
+Node Attributes
+---------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: 25 15 45
+
+   * - Attribute
+     - Type
+     - Description
+   * - ``func``
+     - string
+     - Activation function name
+   * - ``K``
+     - int
+     - Mantissa subdivision bits, default 3
+   * - ``degree``
+     - int
+     - Polynomial degree / FMA stages, default 2
+   * - ``NumChannels``
+     - int
+     - Number of channels in the last input dimension
+   * - ``PE``
+     - int
+     - Processing elements
+   * - ``inputDataType``
+     - string
+     - Input data type, always FLOAT32
+   * - ``outputDataType``
+     - string
+     - Output data type, always FLOAT32
+   * - ``numInputVectors``
+     - ints
+     - Batch/spatial dimensions
+
+Supported Functions
+-------------------
+
+.. list-table::
+   :header-rows: 1
+   :widths: 20 20 30
+
+   * - Function
+     - Negative clamp
+     - Positive behaviour
+   * - GELU
+     - 0.0
+     - passthrough (``y=x``)
+   * - SiLU
+     - 0.0
+     - passthrough (``y=x``)
+   * - Sigmoid
+     - 0.0
+     - clamp to 1.0
+   * - Tanh
+     - -1.0
+     - clamp to 1.0
+
+Files
+-----
+
+Python files:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 50
+
+   * - File
+     - Purpose
+   * - ``util/torch_hw_modules.py``
+     - PyTorch activation module, ONNX export, software simulation
+   * - ``custom_op/fpgadataflow/pwpolyf.py``
+     - Base HW op for shape, folding, resource estimates, cppsim
+   * - ``custom_op/fpgadataflow/rtl/pwpolyf_rtl.py``
+     - RTL backend for HDL generation, package generation, rtlsim, IPI
+   * - ``util/pwpolyf.py``
+     - Compatibility imports for existing PWPolyF utility users
+   * - ``transformation/fpgadataflow/convert_to_hw_layers.py``
+     - ``InferPWPolyFLayer`` transformation
+   * - ``builder/build_dataflow_steps.py``
+     - Build pipeline integration
+   * - ``transformation/fpgadataflow/set_folding.py``
+     - Folding support
+
+RTL files:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 50
+
+   * - File
+     - Purpose
+   * - ``finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv``
+     - ``func_cfg_t`` struct per activation (coefficients and clamp config), regenerated per K
+   * - ``finn-rtllib/pwpolyf/hdl/pwpolyf.sv``
+     - Polynomial evaluation pipeline using a Horner chain on DSPFP32
+   * - ``finn-rtllib/pwpolyf/hdl/queue.sv``
+     - Elastic FIFO for backpressure
+   * - ``finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v``
+     - AXI-Stream wrapper template
+
+Tests
+-----
+
+``tests/fpgadataflow/test_fpgadataflow_pwpolyf.py`` covers:
+
+* cppsim for all supported functions, channel counts, spatial shapes, and
+  foldings
+* ONNX export for the explicit ``PiecewisePolyActivation`` path
+* ``InferPWPolyFLayer`` conversion and execution
+* standard op inference for Gelu, Sigmoid, Tanh, SiLU, and Erf-based GELU
+* execution correctness against ``PiecewisePolyActivation``
+* Versal-only specialization checks
+* resource estimates, folded shapes, and expected cycles
+* coefficient package generation for ``K`` and ``degree``
+* Vivado HDL generation, RTL simulation, and stitched IP simulation
diff --git a/docs/finn/pwpolyf.md b/docs/finn/pwpolyf.md
deleted file mode 100644
index cf3aaeeeb5..0000000000
--- a/docs/finn/pwpolyf.md
+++ /dev/null
@@ -1,176 +0,0 @@
-# PWPolyF — Piecewise Polynomial Activation
-
-## Overview
-
-PWPolyF is a hardware activation layer that approximates nonlinear functions
-(GELU, SiLU, Sigmoid, Tanh) using piecewise polynomials evaluated via Horner's
-method on a chain of DSPFP32 FMA units. With the default degree 2, this uses
-two cascaded DSPs and one RAMB18 coefficient ROM per PE, giving
-single-cycle-per-element throughput. Per-function configuration (clamping
-behaviour and polynomial coefficients) is delivered through a SystemVerilog
-package (`pwpolyf_pkg`) using a `func_cfg_t` struct.
-
-The input domain is partitioned into `1 + 2*5*(2^K)` segments: one near-zero
-region, positive octave sub-segments, and negative mirrors. With the default
-K=3 this gives 81 segments. Segment selection reuses the FP32
-exponent/mantissa bit-fields directly, matching the RTL implementation.
-
-Polynomial coefficients are generated at HDL build time by
-`generate_coeffs_pkg()` in `pwpolyf_rtl.py`, which fits polynomials of the
-configured degree to the reference PyTorch functions and writes
-`pwpolyf_pkg.sv` — a SystemVerilog package with one `func_cfg_t` struct per
-activation (clamping config + coefficient table). Both K and degree are
-configurable; they default to K=3 and degree=2 when inferred from standard
-ONNX ops.
-
-## Architecture
-
-PWPolyF is **RTL-only** (no HLS variant) and targets Versal devices only,
-since the RTL instantiates the Versal DSPFP32 primitive. Two export paths are
-supported:
-
-```
-Path A: PiecewisePolyActivation        Path B: nn.GELU / nn.SiLU / etc.
-    |  torch.onnx.export                   |  torch.onnx.export
-    |  (dynamo=False)                      |  (dynamo=True or False)
-    v                                      v
-PWPolyF custom ONNX node           Standard ONNX ops (Gelu, Sigmoid,
-    |                               Tanh, Sigmoid+Mul for SiLU,
-    |                               Div+Erf+Add+Mul+Mul for GELU)
-    |                                      |
-    +------------- both paths -------------+
-                      |
-                InferPWPolyFLayer
-                      v
-            PWPolyF HW op (finn.custom_op.fpgadataflow)
-                      |  SpecializeLayers
-                      v
-            PWPolyF_rtl (finn.custom_op.fpgadataflow.rtl)
-                      |  generate_hdl
-                      v
-            finn-rtllib/pwpolyf/hdl/ SystemVerilog IP
-```
-
-### Standard ONNX op inference
-
-`InferPWPolyFLayer` recognises standard ONNX activation ops in addition to
-the explicit `PWPolyF` custom op. This allows models that use `nn.GELU`,
-`nn.SiLU`, `nn.Sigmoid`, or `nn.Tanh` to be exported with `dynamo=True`
-(or `dynamo=False`) and automatically converted to PWPolyF HW layers.
-
-| ONNX op type | Pattern | Maps to |
-|---|---|---|
-| `Gelu` (opset 20+) | Single node | `func="gelu"` |
-| `Div`+`Erf`+`Add`+`Mul`+`Mul` | `x * 0.5 * (1 + erf(x / sqrt(2)))` | `func="gelu"` |
-| `Sigmoid` | Single node (standalone) | `func="sigmoid"` |
-| `Tanh` | Single node | `func="tanh"` |
-| `Sigmoid` + `Mul` | `Mul(x, Sigmoid(x))` | `func="silu"` |
-
-Notes:
-- `Gelu` as a single ONNX node requires opset 20 or later. With lower
-  opsets (including `dynamo=True` which defaults to opset 18), GELU
-  decomposes into a 5-node Erf-based pattern. Both forms are matched.
-- SiLU (`nn.SiLU`) has no standard ONNX op; it decomposes to
-  `Sigmoid(x) * x`. The transformation detects this two-node pattern.
-- Only FLOAT32 inputs are converted. Quantised activations are skipped.
-
-## Folding
-
-PWPolyF uses PE parallelism. `NumChannels % PE == 0` must hold.
-Each PE instantiates its own polynomial evaluation pipeline (`degree` DSPs).
-`SetFolding` handles PE selection automatically.
-
-| PE | Degree | DSPs       | BRAM18s           | Approx LUTs      | Cycles (per spatial position) |
-|----|--------|------------|-------------------|------------------|-------------------------------|
-| 1  | 2      | 2          | 1                 | 200              | NumChannels                   |
-| C  | 2      | 2C         | C                 | 200C             | 1                             |
-| 1  | 3      | 3          | 2                 | 300              | NumChannels                   |
-
-## Resource estimates
-
-- **DSP:** `degree * PE` (one FP32 FMA stage per polynomial degree per PE)
-- **LUT:** `~100 * degree * PE` (segment address decode + control)
-- **BRAM18:** `(degree - 1) * PE` for default `K=3` (Vivado infers delayed
-  coefficient lookups as 32-bit ROMs)
-- **URAM:** 0
-
-## ONNX export
-
-Two export paths are supported:
-
-1. **`PiecewisePolyActivation` (explicit)** — exports as a single `PWPolyF`
-   custom op via `torch.autograd.Function.symbolic()`. Requires
-   `dynamo=False`. Preserves the `K` attribute on the ONNX node.
-
-2. **Standard nn modules** (`nn.GELU`, `nn.SiLU`, `nn.Sigmoid`, `nn.Tanh`) —
-   export with `dynamo=True` or `dynamo=False`. Produces standard ONNX ops
-   that `InferPWPolyFLayer` converts to PWPolyF with default `K=3`.
-
-Attributes on the explicit PWPolyF ONNX node:
-- `func` (string): one of `gelu`, `silu`, `sigmoid`, `tanh`
-- `K` (int): mantissa subdivision bits (default 3)
-
-## Node attributes (HW op)
-
-| Attribute          | Type   | Description                              |
-|--------------------|--------|------------------------------------------|
-| `func`             | string | Activation function name                 |
-| `K`                | int    | Mantissa subdivision bits (default 3)    |
-| `degree`           | int    | Polynomial degree / FMA stages (default 2) |
-| `NumChannels`      | int    | Number of channels (last input dim)      |
-| `PE`               | int    | Processing elements                      |
-| `inputDataType`    | string | Input data type (FLOAT32)                |
-| `outputDataType`   | string | Output data type (FLOAT32)               |
-| `numInputVectors`  | ints   | Batch/spatial dimensions                 |
-
-## Supported functions
-
-| Function | Negative clamp | Positive behaviour |
-|----------|---------------|--------------------|
-| GELU     | 0.0           | passthrough (y=x)  |
-| SiLU     | 0.0           | passthrough (y=x)  |
-| Sigmoid  | 0.0           | clamp to 1.0       |
-| Tanh     | -1.0          | clamp to 1.0       |
-
-## Files
-
-### Python
-
-| File | Purpose |
-|------|---------|
-| `util/torch_hw_modules.py` | PyTorch activation module, ONNX export, software simulation |
-| `custom_op/fpgadataflow/pwpolyf.py` | Base HW op (shape, folding, resource estimates, cppsim) |
-| `custom_op/fpgadataflow/rtl/pwpolyf_rtl.py` | RTL backend (HDL generation, package generation, rtlsim, IPI) |
-| `util/pwpolyf.py` | Compatibility imports for existing PWPolyF utility users |
-| `transformation/fpgadataflow/convert_to_hw_layers.py` | `InferPWPolyFLayer` transformation |
-| `builder/build_dataflow_steps.py` | Build pipeline integration |
-| `transformation/fpgadataflow/set_folding.py` | Folding support (pe_ops list) |
-
-### RTL
-
-| File | Purpose |
-|------|---------|
-| `finn-rtllib/pwpolyf/hdl/pwpolyf_pkg.sv` | `func_cfg_t` struct per activation (coeffs + clamp config, regenerated per K) |
-| `finn-rtllib/pwpolyf/hdl/pwpolyf.sv` | Polynomial evaluation pipeline (Horner chain on DSPFP32) |
-| `finn-rtllib/pwpolyf/hdl/queue.sv` | Elastic FIFO for backpressure |
-| `finn-rtllib/pwpolyf/hdl/pwpolyf_template_wrapper.v` | AXI-Stream wrapper template |
-
-## Tests
-
-`tests/fpgadataflow/test_fpgadataflow_pwpolyf.py`:
-
-- **cppsim**: all 4 functions x 2 channel counts x 2 spatial shapes x 3 foldings
-- **ONNX export**: verifies single-node export for all functions
-- **InferPWPolyFLayer**: end-to-end export → transform → execute
-- **Standard op inference**: Gelu/Sigmoid/Tanh single-node + SiLU pattern
-- **Erf-based GELU inference**: 5-node Erf decomposition pattern matching + execution
-- **SiLU edge cases**: reversed Mul input order, multi-consumer Sigmoid
-- **Execution correctness**: standard ops produce same output as PiecewisePolyActivation
-- **SpecializeLayers**: verifies RTL specialization
-- **Resource estimates**: DSP/LUT/BRAM checks across PE and degree values
-- **Folded shapes**: input/output/stream width calculations
-- **Expected cycles**: cycle count estimation + analysis pass integration
-- **Coefficient package**: `generate_coeffs_pkg()` output validation for K and degree
-- **HDL generation** (Vivado): verifies `generate_hdl` produces correct files and package content
-- **RTL simulation** (Vivado, slow): node-by-node rtlsim with cycle count verification
-- **Stitched IP** (Vivado, slow): end-to-end stitched IP rtlsim
diff --git a/src/finn/custom_op/fpgadataflow/pwpolyf.py b/src/finn/custom_op/fpgadataflow/pwpolyf.py
index 746ebdeb38..b7a683499b 100644
--- a/src/finn/custom_op/fpgadataflow/pwpolyf.py
+++ b/src/finn/custom_op/fpgadataflow/pwpolyf.py
@@ -1,30 +1,5 @@
-# Copyright (C) 2026, Advanced Micro Devices, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
 
 import math
 import numpy as np
@@ -32,7 +7,8 @@
 
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
 
-# Piecewise polynomial constants matching the RTL module
+# NUM_OCTAVES is fixed by the RTL segment decode and clamp range. K controls
+# the number of mantissa subdivisions inside each of these fixed octaves.
 _NUM_OCTAVES = 5
 _SUPPORTED_FUNCS = {"gelu", "silu", "sigmoid", "tanh"}
 
@@ -82,10 +58,13 @@ def make_shape_compatible_op(self, model):
     def infer_node_datatype(self, model):
         node = self.onnx_node
         idt = model.get_tensor_datatype(node.input[0])
-        if idt != self.get_input_datatype():
-            self.set_nodeattr("inputDataType", idt.name)
-        odt = self.get_output_datatype()
-        model.set_tensor_datatype(node.output[0], odt)
+        assert idt == DataType["FLOAT32"], "%s: PWPolyF requires FLOAT32 input, got %s" % (
+            node.name,
+            idt,
+        )
+        self.set_nodeattr("inputDataType", idt.name)
+        self.set_nodeattr("outputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
 
     def verify_node(self):
         info_messages = []
@@ -114,6 +93,9 @@ def verify_node(self):
         idt = self.get_nodeattr("inputDataType")
         if idt != "FLOAT32":
             info_messages.append("PWPolyF requires FLOAT32 input, got %s" % idt)
+        odt = self.get_nodeattr("outputDataType")
+        if odt != "FLOAT32":
+            info_messages.append("PWPolyF requires FLOAT32 output, got %s" % odt)
 
         return info_messages
 
@@ -149,9 +131,6 @@ def get_normal_input_shape(self, ind=0):
     def get_normal_output_shape(self, ind=0):
         return self.get_normal_input_shape()
 
-    def get_number_output_values(self):
-        return np.prod(self.get_folded_output_shape()[:-1])
-
     def get_exp_cycles(self):
         # II=1, latency amortised over stream length
         return np.prod(self.get_folded_output_shape()[:-1])
diff --git a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
index 3411c81a8b..5dfa730bc1 100644
--- a/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
+++ b/src/finn/custom_op/fpgadataflow/rtl/pwpolyf_rtl.py
@@ -1,37 +1,14 @@
-# Copyright (C) 2026, Advanced Micro Devices, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
 
+import numpy as np
 import os
 import shutil
-import struct
+from qonnx.core.datatype import DataType
 
 from finn.custom_op.fpgadataflow.pwpolyf import PWPolyF
 from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend
+from finn.util.data_packing import array2hexstring
 from finn.util.torch_hw_modules import (
     CLAMP_CFG,
     NUM_OCTAVES,
@@ -42,10 +19,10 @@
 
 def _float_to_hex(f):
     """Convert a Python float to a 32-bit IEEE 754 hex string."""
-    return "%08X" % struct.unpack("!I", struct.pack("!f", float(f)))[0]
+    return array2hexstring(np.array([f]), DataType["FLOAT32"], 32, prefix="").upper()
 
 
-def generate_coeffs_pkg(K, degree=2, num_samples=1000):
+def _generate_coeffs_pkg_data(K, degree=2, num_samples=1000):
     """Generate the pwpolyf_pkg.sv package content for a given K value.
 
     Produces a SystemVerilog package with a func_cfg_t struct per activation
@@ -55,7 +32,7 @@ def generate_coeffs_pkg(K, degree=2, num_samples=1000):
     num_segs = 1 + 2 * NUM_OCTAVES * num_subs
 
     lines = []
-    lines.append("// Auto-generated by pwpolyf_rtl.py — do not edit manually.")
+    lines.append("// Auto-generated by pwpolyf_rtl.py - do not edit manually.")
     lines.append(
         "// DEGREE=%d K=%d NUM_OCTAVES=%d  Segments: %d" % (degree, K, NUM_OCTAVES, num_segs)
     )
@@ -115,6 +92,11 @@ def get_nodeattr_types(self):
         my_attrs.update(RTLBackend.get_nodeattr_types(self))
         return my_attrs
 
+    def _generate_coeffs_pkg(self, num_samples=1000):
+        K = self.get_nodeattr("K")
+        degree = self.get_nodeattr("degree")
+        return _generate_coeffs_pkg_data(K, degree=degree, num_samples=num_samples)
+
     def generate_hdl(self, model, fpgapart, clk):
         rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/pwpolyf/hdl/")
         template_path = rtllib_dir + "pwpolyf_template_wrapper.v"
@@ -148,9 +130,7 @@ def generate_hdl(self, model, fpgapart, clk):
             shutil.copy(rtllib_dir + sv_file, code_gen_dir)
 
         # generate package with coefficients matching the node's K and degree
-        K = self.get_nodeattr("K")
-        degree = self.get_nodeattr("degree")
-        pkg_data = generate_coeffs_pkg(K, degree=degree)
+        pkg_data = self._generate_coeffs_pkg()
         with open(os.path.join(code_gen_dir, "pwpolyf_pkg.sv"), "w") as f:
             f.write(pkg_data)
 
diff --git a/src/finn/util/pwpolyf.py b/src/finn/util/pwpolyf.py
index 1972b0248a..0c426db05b 100644
--- a/src/finn/util/pwpolyf.py
+++ b/src/finn/util/pwpolyf.py
@@ -1,30 +1,5 @@
-# Copyright (C) 2026, Advanced Micro Devices, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
 
 """Compatibility imports for PWPolyF PyTorch utilities.
 
diff --git a/src/finn/util/torch_hw_modules.py b/src/finn/util/torch_hw_modules.py
index 3fc560c182..b12ed05809 100644
--- a/src/finn/util/torch_hw_modules.py
+++ b/src/finn/util/torch_hw_modules.py
@@ -1,30 +1,5 @@
-# Copyright (C) 2026, Advanced Micro Devices, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
 
 """
 PyTorch modules that match FINN hardware-layer behavior.
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index f5d03ca82d..4942b6e7ed 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -1,30 +1,5 @@
-# Copyright (C) 2026, Advanced Micro Devices, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Copyright Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: BSD-3-Clause
 
 import pytest
 
@@ -41,7 +16,6 @@
 
 import finn.core.onnx_exec as oxe
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
-from finn.custom_op.fpgadataflow.rtl.pwpolyf_rtl import generate_coeffs_pkg
 from finn.transformation.fpgadataflow.convert_to_hw_layers import InferPWPolyFLayer
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
@@ -52,7 +26,7 @@
 from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
 from finn.util.torch_hw_modules import PiecewisePolyActivation
 
-test_fpga_part = "xcve2002-sbva484-2MP-e-S"
+test_fpga_part = "xcvc1902-vsva2197-2MP-e-S"
 non_versal_fpga_part = "xczu3eg-sbva484-1-e"
 target_clk_ns = 5
 
@@ -91,6 +65,14 @@ def make_pwpolyf_modelwrapper(func, K, num_channels, num_input_vecs):
     return model
 
 
+def make_pwpolyf_rtl_inst(K=3, degree=2):
+    model = make_pwpolyf_modelwrapper("gelu", K, 4, [1])
+    model = model.transform(SpecializeLayers(test_fpga_part))
+    inst = getCustomOp(model.graph.node[0])
+    inst.set_nodeattr("degree", degree)
+    return inst
+
+
 @pytest.mark.parametrize("func", ["gelu", "silu", "sigmoid", "tanh"])
 @pytest.mark.parametrize("num_channels", [4, 16])
 @pytest.mark.parametrize("num_input_vecs", [[1], [1, 2, 2]])
@@ -590,8 +572,8 @@ def test_pwpolyf_erf_gelu_execution():
 @pytest.mark.parametrize("K", [2, 3, 4])
 @pytest.mark.fpgadataflow
 def test_pwpolyf_generate_coeffs_pkg(K):
-    """Verify generate_coeffs_pkg produces valid SystemVerilog package."""
-    pkg = generate_coeffs_pkg(K)
+    """Verify PWPolyF_rtl coefficient generation produces valid SystemVerilog."""
+    pkg = make_pwpolyf_rtl_inst(K=K)._generate_coeffs_pkg()
 
     assert "package pwpolyf_pkg" in pkg
     assert "endpackage" in pkg
@@ -613,9 +595,9 @@ def test_pwpolyf_generate_coeffs_pkg(K):
 @pytest.mark.parametrize("degree", [1, 2, 3])
 @pytest.mark.fpgadataflow
 def test_pwpolyf_generate_coeffs_pkg_degree(degree):
-    """Verify generate_coeffs_pkg respects degree parameter."""
+    """Verify PWPolyF_rtl coefficient generation respects degree parameter."""
     K = 3
-    pkg = generate_coeffs_pkg(K, degree=degree)
+    pkg = make_pwpolyf_rtl_inst(K=K, degree=degree)._generate_coeffs_pkg()
 
     assert "DEGREE      = %d;" % degree in pkg
     # Each segment line should have degree+1 coefficient values

From f3156c4edfd84e37b6680cb2b30e71a174874585 Mon Sep 17 00:00:00 2001
From: ollycassidy13 <ollyj.cassidy@gmail.com>
Date: Wed, 20 May 2026 12:28:58 +0100
Subject: [PATCH 12/12] export degree attribute to match brevitas

---
 .../fpgadataflow/convert_to_hw_layers.py               |  3 +++
 src/finn/util/torch_hw_modules.py                      |  9 +++++----
 tests/fpgadataflow/test_fpgadataflow_pwpolyf.py        | 10 +++++++---
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index dc09b3daeb..185dc73e06 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -353,6 +353,8 @@ def apply(self, model):
                 func = get_by_name(node.attribute, "func").s.decode("utf-8")
                 K_attr = get_by_name(node.attribute, "K")
                 K = K_attr.i if K_attr is not None else 3
+                degree_attr = get_by_name(node.attribute, "degree")
+                degree = degree_attr.i if degree_attr is not None else 2
 
                 new_node = self._make_pwpolyf_node(
                     pwp_input,
@@ -362,6 +364,7 @@ def apply(self, model):
                     idt,
                     "PWPolyF_" + node.name,
                     K,
+                    degree,
                 )
                 graph.node.insert(node_ind, new_node)
                 graph.node.remove(node)
diff --git a/src/finn/util/torch_hw_modules.py b/src/finn/util/torch_hw_modules.py
index b12ed05809..d73ae16f0c 100644
--- a/src/finn/util/torch_hw_modules.py
+++ b/src/finn/util/torch_hw_modules.py
@@ -117,10 +117,10 @@ class PWPolyFFunction(torch.autograd.Function):
     """Emit a single PWPolyF ONNX node during legacy torch.onnx export."""
 
     @staticmethod
-    def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
+    def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K, degree):
         num_subs = 1 << K
         num_segs = 1 + 2 * NUM_OCTAVES * num_subs
-        degree = coeffs.shape[1] - 1
+        degree = int(degree)
         pos_passthrough = CLAMP_CFG[func]["pos_passthrough"]
 
         orig_shape = x.shape
@@ -144,8 +144,8 @@ def forward(ctx, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
         return y.view(orig_shape)
 
     @staticmethod
-    def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K):
-        return g.op("PWPolyF", x, func_s=func, K_i=K)
+    def symbolic(g, x, coeffs, neg_clamp_val, pos_clamp_val, func, K, degree):
+        return g.op("PWPolyF", x, func_s=func, K_i=K, degree_i=degree)
 
 
 class PiecewisePolyActivation(nn.Module):
@@ -186,6 +186,7 @@ def forward(self, x):
                 self.pos_clamp_val,
                 self.func,
                 self.K,
+                self.degree,
             )
 
         orig_shape = x.shape
diff --git a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
index 4942b6e7ed..b9de975778 100644
--- a/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
+++ b/tests/fpgadataflow/test_fpgadataflow_pwpolyf.py
@@ -109,8 +109,9 @@ def test_pwpolyf_cppsim(func, num_channels, num_input_vecs, fold):
 @pytest.mark.fpgadataflow
 def test_pwpolyf_onnx_export(func):
     K = 3
+    degree = 3
     num_channels = 32
-    mod = PiecewisePolyActivation(func, K=K)
+    mod = PiecewisePolyActivation(func, K=K, degree=degree)
     mod.eval()
     dummy = torch.randn(1, num_channels)
 
@@ -138,14 +139,16 @@ def test_pwpolyf_onnx_export(func):
     func_attr = {a.name: a for a in node.attribute}
     assert func_attr["func"].s.decode("utf-8") == func
     assert func_attr["K"].i == K
+    assert func_attr["degree"].i == degree
 
 
 @pytest.mark.parametrize("func", ["gelu", "sigmoid"])
 @pytest.mark.fpgadataflow
 def test_pwpolyf_infer_transform(func):
     K = 3
+    degree = 3
     num_channels = 16
-    mod = PiecewisePolyActivation(func, K=K)
+    mod = PiecewisePolyActivation(func, K=K, degree=degree)
     mod.eval()
     dummy = torch.randn(1, num_channels)
 
@@ -178,6 +181,7 @@ def test_pwpolyf_infer_transform(func):
     inst = getCustomOp(node)
     assert inst.get_nodeattr("func") == func
     assert inst.get_nodeattr("K") == K
+    assert inst.get_nodeattr("degree") == degree
     assert inst.get_nodeattr("NumChannels") == num_channels
     assert inst.get_nodeattr("PE") == 1
     assert inst.get_nodeattr("inputDataType") == "FLOAT32"
@@ -186,7 +190,7 @@ def test_pwpolyf_infer_transform(func):
     input_dict = {"inp": x}
     y_produced = oxe.execute_onnx(model, input_dict)["outp"]
 
-    ref_mod = PiecewisePolyActivation(func, K=K)
+    ref_mod = PiecewisePolyActivation(func, K=K, degree=degree)
     with torch.no_grad():
         y_expected = ref_mod(torch.from_numpy(x)).numpy()
     assert np.allclose(y_produced, y_expected, atol=1e-6)