Skip to content

Commit a15e2ac

Browse files
authored
Merge pull request #1570 from Xilinx/feature/gen_rtleltwise
[Elementwise RTL] Adding elementwise RTL support for INT and UINT inputs
2 parents 17aec22 + 2b60540 commit a15e2ac

13 files changed

Lines changed: 762 additions & 358 deletions

File tree

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
/****************************************************************************
2-
* Copyright (C) 2025, Advanced Micro Devices, Inc.
3-
* All rights reserved.
4-
*
2+
* Copyright Advanced Micro Devices, Inc.
53
* SPDX-License-Identifier: BSD-3-Clause
64
*
75
* @author Thomas B. Preußer <thomas.preusser@amd.com>

finn-rtllib/eltwise/binopi.sv

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
/****************************************************************************
2+
* Copyright (C) 2026, Advanced Micro Devices, Inc.
3+
* All rights reserved.
4+
*
5+
* SPDX-License-Identifier: BSD-3-Clause
6+
*
7+
* @brief Integer binary operation: a OP b.
8+
* @author Shane Fleming <shane.fleming@amd.com>
9+
***************************************************************************/
10+
11+
module binopi #(
	parameter  OP,	// ADD(a+b), SUB(a-b), SBR(b-a), MUL(a*b)
	int unsigned  WIDTH,	// bit width of each of the operands a and b
	bit  SIGNED = 0,	// 1: operands are interpreted via $signed, 0: unsigned

	localparam bit  IS_MUL = (OP == "MUL"),
	localparam int unsigned  O_WIDTH = IS_MUL? 2*WIDTH : WIDTH + 1
)(
	input	logic  clk,
	input	logic  rst,

	input	logic [WIDTH-1:0]  a,
	input	logic  avld,
	input	logic [WIDTH-1:0]  b,
	input	logic  bload,

	output	logic [O_WIDTH-1:0]  r,
	output	logic  rvld
);

	// Number of register stages between (a, avld) and (r, rvld).
	localparam int unsigned  LATENCY = IS_MUL? 3 : 1;

	// Reject any operation name that none of the datapaths below implements.
	initial begin
		if(!((OP == "ADD") || (OP == "SUB") || (OP == "SBR") || (OP == "MUL"))) begin
			$error("%m: Unsupported integer operation %s", OP);
			$finish;
		end
	end

	//=== Valid Signalling ================================================
	// LATENCY-deep shift register: avld enters at bit 0 and leaves as rvld
	// from the top bit. The bit shifted out beyond the top is discarded.
	logic [LATENCY-1:0]  Vld = '0;
	always_ff @(posedge clk) begin
		if(rst)  Vld <= '0;
		else     Vld <= LATENCY'({ Vld, avld });
	end
	assign	rvld = Vld[LATENCY-1];

	//=== Multiply Pipeline (DSP-inferable) ===============================
	// 3 stages: input regs (AREG), product reg (MREG), output reg (PREG).
	// Vivado retimes into a DSP58 INT MUL.
	if(IS_MUL) begin : genMul
		logic [  WIDTH-1:0]  A1 = 'x;
		logic [  WIDTH-1:0]  B1 = 'x;
		logic [O_WIDTH-1:0]  M  = 'x;
		logic [O_WIDTH-1:0]  P  = 'x;

		// Full-width product of the registered operands.
		// The size cast keeps the signed product sign-extended to O_WIDTH.
		uwire [O_WIDTH-1:0]  prod = SIGNED? O_WIDTH'($signed(A1) * $signed(B1)) : A1 * B1;

		always_ff @(posedge clk) begin
			if(rst) begin
				A1 <= 'x;
				B1 <= 'x;
				M  <= 'x;
				P  <= 'x;
			end
			else begin
				A1 <= a;			// a is sampled every cycle
				if(bload)  B1 <= b;	// b is only replaced while bload is asserted
				M  <= prod;
				P  <= M;
			end
		end
		assign	r = P;
	end : genMul

	//=== Add / Subtract (single output register) =========================
	// Note: bload is not evaluated on this path; b is consumed directly
	// in the same cycle as a.
	else begin : genAddSub
		// Operands widened once to the result width: sign-extended when
		// SIGNED, zero-extended otherwise.
		uwire [O_WIDTH-1:0]  ax = SIGNED? O_WIDTH'($signed(a)) : O_WIDTH'(a);
		uwire [O_WIDTH-1:0]  bx = SIGNED? O_WIDTH'($signed(b)) : O_WIDTH'(b);

		logic [O_WIDTH-1:0]  P1 = 'x;
		always_ff @(posedge clk) begin
			if(rst)  P1 <= 'x;
			else begin
				unique case(OP)
				"ADD":  P1 <= ax + bx;
				"SUB":  P1 <= ax - bx;
				"SBR":  P1 <= bx - ax;
				endcase
			end
		end
		assign	r = P1;
	end : genAddSub

endmodule : binopi

finn-rtllib/eltwise/eltwise.sv

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
/****************************************************************************
2+
* Copyright Advanced Micro Devices, Inc.
3+
* SPDX-License-Identifier: BSD-3-Clause
4+
*
5+
* @brief Two-input elementwise stream operation (generalized).
6+
* Supports float/float, int/float, float/int, and int/int paths.
7+
* @author Thomas B. Preußer <thomas.preusser@amd.com>
8+
* @author Shane Fleming <shane.fleming@amd.com>
9+
***************************************************************************/
10+
11+
module eltwise #(
	parameter  OP,	// ADD(a+b), SUB(a-b), SBR(b-a), MUL(a*b)
	int unsigned  PE = 1,
	shortreal  B_SCALE = 1.0,
	bit  FORCE_BEHAVIORAL = 0,

	// Type selection: 1 = float32, 0 = integer
	bit  A_FLOAT = 1,
	bit  B_FLOAT = 1,

	// Integer parameters (ignored when corresponding input is float)
	int unsigned  A_WIDTH  = 32,
	bit           A_SIGNED = 0,
	int unsigned  B_WIDTH  = 32,
	bit           B_SIGNED = 0,

	// Port-width derivations (do not override)
	localparam int unsigned  A_DAT_W = A_FLOAT? 32 : A_WIDTH,
	localparam int unsigned  B_DAT_W = B_FLOAT? 32 : B_WIDTH,
	localparam bit  BOTH_INT = !A_FLOAT && !B_FLOAT,
	localparam bit  IS_MUL = (OP == "MUL"),
	localparam int unsigned  INT_WIDTH = BOTH_INT? A_WIDTH : 0,
	localparam int unsigned  O_WIDTH =
		BOTH_INT? (IS_MUL? 2*INT_WIDTH : INT_WIDTH + 1) : 32
)(
	input	logic  clk,
	input	logic  rst,

	input	logic [PE-1:0][A_DAT_W-1:0]  adat,
	input	logic  avld,
	output	logic  ardy,
	input	logic [PE-1:0][B_DAT_W-1:0]  bdat,
	input	logic  bvld,
	output	logic  brdy,

	output	logic [PE-1:0][O_WIDTH-1:0]  odat,
	output	logic  ovld,
	input	logic  ordy
);

	//=== Derived Parameters ===============================================
	localparam bit  BOTH_FLOAT = A_FLOAT && B_FLOAT;
	localparam bit  HAVE_SCALE = (B_SCALE != 1.0);

	// Pipeline depths assumed for the compute cores. BINOPI_LATENCY mirrors
	// the LATENCY constant inside binopi; BINOPF_LATENCY is presumably the
	// matching figure for binopf - confirm against that module.
	localparam int unsigned  BINOPF_LATENCY = HAVE_SCALE? 4 : (2 + IS_MUL);
	localparam int unsigned  BINOPI_LATENCY = IS_MUL? 3 : 1;
	// Mixed int/float operation adds one register stage behind int_to_fp32.
	localparam int unsigned  CONV_LATENCY = (A_FLOAT != B_FLOAT)? 1 : 0;
	localparam int unsigned  LATENCY =
		BOTH_INT? BINOPI_LATENCY : (BINOPF_LATENCY + CONV_LATENCY);

	// Number of operations allowed in flight; also the output queue setting.
	localparam int unsigned  CREDIT = LATENCY + 3;

	//=== Parameter Validation =============================================
	// The integer-integer path takes no scale and needs matching operand
	// signedness and width.
	initial begin
		if(BOTH_INT) begin
			if(B_SCALE != 1.0) begin
				$error("%m: B_SCALE=%f not supported for integer-integer path", B_SCALE);
				$finish;
			end
			if(A_SIGNED != B_SIGNED) begin
				$error("%m: A_SIGNED must match B_SIGNED for integer-integer path");
				$finish;
			end
			if(A_WIDTH != B_WIDTH) begin
				$error("%m: A_WIDTH must match B_WIDTH for integer-integer path");
				$finish;
			end
		end
	end

	//=== Input Sidestep Registers =========================================
	// Each input has one holding register. While `rdy` is set the register
	// is transparent (the port data is used directly and captured each
	// cycle); an accepted but not yet issued input clears `rdy` and is
	// served from `val` until the next `take`.
	uwire  take;

	typedef logic [PE-1:0][A_DAT_W-1:0]  a_vec_t;
	typedef logic [PE-1:0][B_DAT_W-1:0]  b_vec_t;
	typedef logic [PE-1:0][O_WIDTH-1:0]  o_vec_t;

	typedef struct {
		a_vec_t  val;
		logic    rdy;
	} abuf_t;
	typedef struct {
		b_vec_t  val;
		logic    rdy;
	} bbuf_t;

	abuf_t  A = '{ val: 'x, rdy: '1 };
	always_ff @(posedge clk) begin
		if(rst)  A <= '{ val: 'x, rdy: '1 };
		else begin
			if(A.rdy)  A.val <= adat;
			A.rdy <= take || (A.rdy && !avld);
		end
	end

	bbuf_t  B = '{ val: 'x, rdy: '1 };
	always_ff @(posedge clk) begin
		if(rst)  B <= '{ val: 'x, rdy: '1 };
		else begin
			if(B.rdy)  B.val <= bdat;
			B.rdy <= take || (B.rdy && !bvld);
		end
	end

	assign	ardy = A.rdy;
	assign	brdy = B.rdy;

	// Operand vectors presented to the compute cores.
	uwire a_vec_t  a = A.rdy? adat : A.val;
	uwire b_vec_t  b = B.rdy? bdat : B.val;

	//=== Credit-based Operation Issue =====================================
	// Credit starts at -CREDIT, counts up with every issued operation and
	// down with every output handshake. A set sign bit means that credit
	// is left, i.e. fewer than CREDIT operations are outstanding.
	logic signed [$clog2(CREDIT):0]  Credit = -CREDIT;

	uwire  give       = ovld && ordy;
	uwire  a_avail    = avld || !ardy;	// fresh on the port or parked in A.val
	uwire  b_avail    = bvld || !brdy;	// fresh on the port or parked in B.val
	uwire  has_credit = Credit[$left(Credit)];
	assign	take = a_avail && b_avail && has_credit;

	always_ff @(posedge clk) begin
		if(rst)  Credit <= -CREDIT;
		else     Credit <= Credit + ((take && !give)? 1 : (give && !take)? -1 : 0);
	end

	//=== Converter Valid Alignment =======================================
	// `take` delayed by one cycle to accompany the registered converter
	// outputs of the mixed int/float paths.
	logic  Take = 1'b0;
	always_ff @(posedge clk)  Take <= rst? 1'b0 : take;

	//=== Free-running Compute Pipeline ====================================
	uwire o_vec_t   r;
	uwire [PE-1:0]  rvld_vec;
	uwire           rvld;

	for(genvar  i = 0; i < PE; i++) begin : genPE

		// float OP float: operands go straight into the float core.
		if(BOTH_FLOAT) begin : genFF
			binopf #(.OP(OP), .B_SCALE(B_SCALE), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
				.clk, .rst,
				.a(a[i]), .avld(take),
				.b(b[i]), .bload('1),
				.r(r[i]), .rvld(rvld_vec[i])
			);
		end : genFF

		// int OP int: operands go straight into the integer core.
		else if(BOTH_INT) begin : genII
			binopi #(.OP(OP), .WIDTH(INT_WIDTH), .SIGNED(A_SIGNED)) core (
				.clk, .rst,
				.a(a[i]), .avld(take),
				.b(b[i]), .bload('1),
				.r(r[i]), .rvld(rvld_vec[i])
			);
		end : genII

		// int OP float: convert a, register both operands, then float core.
		else if(B_FLOAT) begin : genIF
			uwire [31:0]  a_fp;
			int_to_fp32 #(.WIDTH(A_WIDTH), .SIGNED(A_SIGNED)) conv (
				.ival(a[i]), .fval(a_fp)
			);

			logic [31:0]  AFp = '0;
			logic [31:0]  Bd  = '0;
			always_ff @(posedge clk) begin
				if(rst) begin
					AFp <= '0;
					Bd  <= '0;
				end
				else begin
					AFp <= a_fp;
					Bd  <= b[i];	// delayed alongside the converted a
				end
			end

			binopf #(.OP(OP), .B_SCALE(B_SCALE), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
				.clk, .rst,
				.a(AFp), .avld(Take),
				.b(Bd),  .bload('1),
				.r(r[i]), .rvld(rvld_vec[i])
			);
		end : genIF

		// float OP int: convert b, register both operands, then float core.
		else begin : genFI
			uwire [31:0]  b_fp;
			int_to_fp32 #(.WIDTH(B_WIDTH), .SIGNED(B_SIGNED)) conv (
				.ival(b[i]), .fval(b_fp)
			);

			logic [31:0]  BFp = '0;
			logic [31:0]  Ad  = '0;
			always_ff @(posedge clk) begin
				if(rst) begin
					BFp <= '0;
					Ad  <= '0;
				end
				else begin
					BFp <= b_fp;
					Ad  <= a[i];	// delayed alongside the converted b
				end
			end

			binopf #(.OP(OP), .B_SCALE(B_SCALE), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
				.clk, .rst,
				.a(Ad),  .avld(Take),
				.b(BFp), .bload('1),
				.r(r[i]), .rvld(rvld_vec[i])
			);
		end : genFI

	end : genPE

	// All PE results should be valid simultaneously; PE 0 is the reference.
	assign	rvld = rvld_vec[0];
	always_ff @(posedge clk) begin
		assert(rvld_vec == {PE{rvld}}) else begin
			$error("%m: Inconsistent output valid indications.");
			$stop;
		end
	end

	//=== Credit-backing Elastic Output Queue ==============================
	// The queue is parameterized with ELASTICITY = CREDIT. The assertion
	// below checks that it never refuses a result - presumably guaranteed by
	// the credit limit; confirm against the queue implementation.
	uwire  rrdy;
	queue #(.DATA_WIDTH($bits(o_vec_t)), .ELASTICITY(CREDIT)) obuf (
		.clk, .rst,
		.idat(r), .ivld(rvld), .irdy(rrdy),
		.odat, .ovld, .ordy
	);
	always_ff @(posedge clk) begin
		assert(rrdy || !rvld) else begin
			$error("%m: Result queue overrun.");
			$stop;
		end
	end

endmodule : eltwise
Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
/****************************************************************************
2-
* Copyright (C) 2025, Advanced Micro Devices, Inc.
3-
* All rights reserved.
4-
*
2+
* Copyright Advanced Micro Devices, Inc.
53
* SPDX-License-Identifier: BSD-3-Clause
6-
7-
* @author Shane T. Fleming <shane.fleming@amd.com>
4+
*
5+
* @brief Generalized elementwise wrapper template.
6+
* Supports float/float, int/float, float/int, and int/int paths.
7+
* @author Shane T. Fleming <shane.fleming@amd.com>
88
****************************************************************************/
99

1010
module $TOP_MODULE_NAME$(
@@ -17,24 +17,30 @@ input ap_rst_n,
1717
// -- AXIS input ------------------
1818
output in0_V_TREADY,
1919
input in0_V_TVALID,
20-
input [$STREAM_BITS$-1:0] in0_V_TDATA,
20+
input [$A_STREAM_BITS$-1:0] in0_V_TDATA,
2121

2222
output in1_V_TREADY,
2323
input in1_V_TVALID,
24-
input [$STREAM_BITS$-1:0] in1_V_TDATA,
24+
input [$B_STREAM_BITS$-1:0] in1_V_TDATA,
2525

2626

2727
// -- AXIS output ------------------
2828
input out0_V_TREADY,
2929
output out0_V_TVALID,
30-
output [$STREAM_BITS$-1:0] out0_V_TDATA
30+
output [$O_STREAM_BITS$-1:0] out0_V_TDATA
3131
);
3232

33-
eltwisef #(
33+
eltwise #(
3434
.PE($PE$),
3535
.OP($OP$),
3636
.B_SCALE($B_SCALE$),
37-
.FORCE_BEHAVIORAL($FORCE_BEHAVIORAL$)
37+
.FORCE_BEHAVIORAL($FORCE_BEHAVIORAL$),
38+
.A_FLOAT($A_FLOAT$),
39+
.B_FLOAT($B_FLOAT$),
40+
.A_WIDTH($A_WIDTH$),
41+
.A_SIGNED($A_SIGNED$),
42+
.B_WIDTH($B_WIDTH$),
43+
.B_SIGNED($B_SIGNED$)
3844
) impl (
3945
.clk(ap_clk),
4046
.rst(!ap_rst_n),

0 commit comments

Comments
 (0)