Skip to content

Commit 9c6946d

Browse files
authored
Merge pull request #1343 from Xilinx/tpreusse.thresholding
Allow thresholding into a range whose size is not a full power of two.
2 parents 474998b + e75bea0 commit 9c6946d

7 files changed

Lines changed: 207 additions & 155 deletions

File tree

finn-rtllib/thresholding/hdl/axilite_if.v

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3030
*****************************************************************************/
3131

32-
module axi4lite_if
32+
module axilite_if
3333
#(
3434
parameter ADDR_WIDTH = 32,
3535
parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64

finn-rtllib/thresholding/hdl/thresholding.sv

Lines changed: 60 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -32,34 +32,34 @@
3232
* @author Thomas B. Preußer <thomas.preusser@amd.com>
3333
*
3434
* @description
35-
* Produces the N-bit count of those among 2^N-1 thresholds that are not
35+
* Produces the count of those among N thresholds that are not
3636
* larger than the corresponding input:
3737
* y = Σ(T_i <= x)
3838
* The result is computed by binary search. The runtime-configurable
39-
* thresholds must be written in ascending order:
39+
* thresholds must be sorted in ascending order:
4040
* i < j => T_i < T_j
4141
* The design supports channel folding allowing each input to be processed
4242
* with respect to a selectable set of thresholds. The corresponding
4343
* threshold configuration relies on a channel address prefix. Inputs are
4444
* accompanied by a channel selector.
4545
*
4646
* Parameter Layout as seen on AXI-Lite (row by row):
47-
* | Base \ Offs | 0 1 2 ... 2^N-2 2^N-1
48-
* ---------+--------------------------------+------------------------------------
49-
* Chnl #0 | 0 | T_0 T_1 T_2 ... T_{2^N-2} 'x
50-
* Chnl #1 | 2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x
51-
* Chnl #c | ((c/PE)*$clog2(PE) + c%PE)*2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x
47+
* | Base \ Offs | 0 1 2 ... N-1 ...
48+
* ---------+----------------------------------------+---------------------------------
49+
* Chnl #0 | 0 | T_0 T_1 T_2 ... T_{N-1} 'x
50+
* Chnl #1 | 2^$clog2(N) | T_0 T_1 T_2 ... T_{N-1} 'x
51+
* Chnl #c | ((c/PE)*2^$clog2(PE) + c%PE)*2^$clog2(N) | T_0 T_1 T_2 ... T_{N-1} 'x
5252
*
5353
*****************************************************************************/
5454
module thresholding #(
55-
int unsigned N, // output precision
5655
int unsigned K, // input/threshold precision
56+
int unsigned N, // number of thresholds
5757
int unsigned C, // number of channels
5858
int unsigned PE, // parallel processing elements
5959

6060
bit SIGNED = 1, // signed inputs
6161
bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa
62-
int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
62+
int BIAS = 0, // offsetting the output [0, N] -> [BIAS, N+BIAS]
6363

6464
// Initial Thresholds
6565
parameter THRESHOLDS_PATH = "",
@@ -72,8 +72,8 @@ module thresholding #(
7272

7373
localparam int unsigned CF = C/PE, // Channel fold
7474
localparam int unsigned O_BITS = BIAS >= 0?
75-
/* unsigned */ $clog2(2**N+BIAS) :
76-
/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
75+
/* unsigned */ $clog2(N+BIAS+1) :
76+
/* signed */ 1+$clog2(-BIAS >= N+BIAS+1? -BIAS : N+BIAS+1)
7777
)(
7878
// Global Control
7979
input logic clk,
@@ -82,7 +82,7 @@ module thresholding #(
8282
// Threshold Configuration
8383
input logic cfg_en,
8484
input logic cfg_we,
85-
input logic [$clog2(CF)+$clog2(PE)+N-1:0] cfg_a,
85+
input logic [$clog2(CF)+$clog2(PE)+$clog2(N)-1:0] cfg_a,
8686
input logic [K-1:0] cfg_d,
8787
output logic cfg_rack,
8888
output logic [K-1:0] cfg_q,
@@ -115,22 +115,25 @@ module thresholding #(
115115
CFG = 2'b1x // Config op (pointer-preserving)
116116
} op_e;
117117

118+
//-----------------------------------------------------------------------
119+
// Pipeline Feed
120+
// - M := $clog2(N+1) pipeline stages
121+
// - configuration always takes precedence
122+
// - number of pending thresholding ops capped to MAX_PENDING = (DEEP_PIPELINE+1)*M + 3
123+
// across pipeline and output FIFO: pipe:(DEEP_PIPELINE+1)*M + A:1 + B:1 + 1
124+
localparam int unsigned M = $clog2(N+1);
125+
localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*M + 3;
126+
118127
// Pipeline Link Type
119-
typedef logic [$clog2(CF)+N-1:0] ptr_t;
128+
typedef logic [$clog2(CF)+M-1:0] ptr_t;
120129
typedef logic [K -1:0] val_t;
121130
typedef struct packed {
122131
op_e op;
123132
ptr_t ptr; // WR/RB: address; TH: result
124133
val_t val; // WR/RB: threshold value; TH: input value
125134
} pipe_t;
126135

127-
//-----------------------------------------------------------------------
128-
// Pipeline Feed
129-
// - configuration always takes precedence
130-
// - number of pending thresholding ops capped to N+3
131-
// across pipeline and output FIFO: pipe:N + A:1 + B:1 + 1
132-
localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*N + 3;
133-
pipe_t pipe[PE][N+1];
136+
pipe_t pipe[PE][M+1];
134137
if(1) begin : blkFeed
135138

136139
// Thresholding Input Guard ensuring Output FIFO is never overrun
@@ -148,20 +151,20 @@ module thresholding #(
148151
// PE Configuration Address Decoding
149152
logic cfg_sel[PE];
150153
logic cfg_oob;
151-
logic [N-1:0] cfg_ofs;
154+
logic [$clog2(N)-1:0] cfg_ofs;
152155
if(PE == 1) begin
153156
assign cfg_sel[0] = 1;
154157
assign cfg_oob = 0;
155-
assign cfg_ofs = cfg_a[0+:N];
158+
assign cfg_ofs = cfg_a[0+:$clog2(N)];
156159
end
157160
else begin
158-
uwire [$clog2(PE)-1:0] cfg_pe = cfg_a[N+:$clog2(PE)];
161+
uwire [$clog2(PE)-1:0] cfg_pe = cfg_a[$clog2(N)+:$clog2(PE)];
159162
always_comb begin
160163
foreach(cfg_sel[pe]) begin
161164
cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_pe == pe);
162165
end
163166
cfg_oob = (cfg_pe >= PE);
164-
cfg_ofs = cfg_a[0+:N];
167+
cfg_ofs = cfg_a[0+:$clog2(N)];
165168
if(cfg_oob && !cfg_we) begin
166169
// Map readbacks from padded rows (non-existent PEs) to padded highest threshold index of first PE
167170
cfg_sel[0] = 1;
@@ -171,7 +174,7 @@ module thresholding #(
171174
end
172175

173176
uwire ptr_t iptr;
174-
assign iptr[0+:N] = cfg_ofs;
177+
assign iptr[0+:M] = cfg_ofs; // Zero-extends by one bit when N = 2^k, as then M = $clog2(N)+1
175178
if(CF > 1) begin
176179
// Channel Fold Rotation
177180
logic [$clog2(CF)-1:0] CnlCnt = 0;
@@ -187,7 +190,7 @@ module thresholding #(
187190
end
188191
end
189192

190-
assign iptr[N+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[N+$clog2(PE)+:$clog2(CF)] : CnlCnt;
193+
assign iptr[M+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[$clog2(N)+$clog2(PE)+:$clog2(CF)] : CnlCnt;
191194
end
192195

193196
for(genvar pe = 0; pe < PE; pe++) begin
@@ -205,9 +208,9 @@ module thresholding #(
205208

206209
//-----------------------------------------------------------------------
207210
// Free-Running Thresholding Pipeline
208-
for(genvar stage = 0; stage < N; stage++) begin : genStages
211+
for(genvar stage = 0; stage < M; stage++) begin : genStages
209212

210-
localparam int unsigned SN = N-1-stage;
213+
localparam int unsigned SN = M-1-stage;
211214
for(genvar pe = 0; pe < PE; pe++) begin : genPE
212215
uwire pipe_t p = pipe[pe][stage];
213216
uwire cs = (p.ptr[SN:0] == 2**SN-1);
@@ -222,7 +225,7 @@ module thresholding #(
222225
// If BRAM trigger defined, force distributed memory below if Vivado may be tempted to use BRAM nonetheless.
223226
DEPTH_TRIGGER_BRAM && (DEPTH >= 64)? "distributed" : "auto";
224227

225-
(* DONT_TOUCH = "true", RAM_STYLE = RAM_STYLE *)
228+
(* DONT_TOUCH = "true", RAM_STYLE = RAM_STYLE *)
226229
val_t Threshs[DEPTH];
227230
if(THRESHOLDS_PATH != "") begin
228231
initial $readmemh($sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage), Threshs);
@@ -236,7 +239,7 @@ module thresholding #(
236239
end
237240
end
238241
else begin
239-
uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1];
242+
uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+M-1:SN+1];
240243
always @(posedge clk) begin
241244
if(we) Threshs[addr] <= p.val;
242245
end
@@ -247,7 +250,7 @@ module thresholding #(
247250
assign Thresh = Threshs[0];
248251
end
249252
else begin
250-
uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1];
253+
uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+M-1:SN+1];
251254
always_ff @(posedge clk) begin
252255
Thresh <= Threshs[addr];
253256
end
@@ -256,19 +259,30 @@ module thresholding #(
256259
end : blkThresh
257260

258261
// Pipeline State
262+
localparam int unsigned SCOPE_REDUCE = (2**(M-stage-1) + 2**M-1-N) >> (M-stage);
259263
pipe_t P = '{ op: NOP, default: 'x };
260-
logic Reval = 0;
264+
logic Reval = 'x; // Replace value by readback
265+
logic Scope = 'x; // Comparison in scope of specified threshold count
261266
always_ff @(posedge clk) begin
262267
if(rst) begin
263268
P <= '{ op: NOP, default: 'x };
264-
Reval <= 0;
269+
Reval <= 'x;
270+
Scope <= 'x;
265271
end
266272
else begin
267273
P <= p;
268274
Reval <= (p.op ==? RB) && cs;
275+
Scope <= (SCOPE_REDUCE == 0)? 1 : p.ptr[M-1:SN+1] < 2**stage - SCOPE_REDUCE;
269276
end
270277
end
271278

279+
always_ff @(posedge clk) begin
280+
assert((P.op !=? TH) || (Scope !== 1'bx)) else begin
281+
$error("%m: [%0d.%0d] Broken Scope.", pe, stage);
282+
end
283+
end
284+
285+
// Mask comparisons beyond specified threshold count
272286
logic cmp;
273287
if(!SIGNED) assign cmp = $unsigned(Thresh) <= $unsigned(P.val);
274288
else if(!FPARG) assign cmp = $signed(Thresh) <= $signed(P.val);
@@ -290,7 +304,7 @@ module thresholding #(
290304
pipe_t pp;
291305
always_comb begin
292306
pp = P;
293-
if(P.op !=? CFG) pp.ptr[SN] = cmp;
307+
if(P.op !=? CFG) pp.ptr[SN] = Scope && cmp;
294308
if(Reval) pp.val = Thresh;
295309
end
296310

@@ -301,7 +315,12 @@ module thresholding #(
301315
pipe_t Pf = '{ op: NOP, default: 'x };
302316
always_ff @(posedge clk) begin
303317
if(rst) Pf <= '{ op: NOP, default: 'x };
304-
else Pf <= pp;
318+
else begin
319+
assert((pp.op !=? TH) || (^pp.ptr[$left(ptr_t):SN] !== 1'bx)) else begin
320+
$error("%m: [%0d.%0d] Broken ptr[$left:%0d].", pe, stage, SN);
321+
end
322+
Pf <= pp;
323+
end
305324
end
306325
assign pf = Pf;
307326
end
@@ -317,34 +336,34 @@ module thresholding #(
317336
cfg_rack = 0;
318337
cfg_q = 0;
319338
foreach(pipe[pe]) begin
320-
automatic pipe_t p = pipe[pe][N];
339+
automatic pipe_t p = pipe[pe][M];
321340
cfg_rack |= p.op ==? RB;
322341
cfg_q |= p.val;
323342
end
324343
end
325344

326345
//-----------------------------------------------------------------------
327346
// Stream Output through FIFO
328-
// - Depth of N + Output Reg to allow pipe to drain entirely under backpressure
347+
// - Depth of M + Output Reg to allow pipe to drain entirely under backpressure
329348
// - Typically mapped to an SRL shift register
330349
if(1) begin : blkStreamOutput
331350
localparam int unsigned A_DEPTH = MAX_PENDING - 1;
332-
logic [PE-1 : 0][N-1 : 0] ADat[A_DEPTH];
351+
logic [PE-1 : 0][M-1 : 0] ADat[A_DEPTH];
333352
logic signed [$clog2(A_DEPTH):0] APtr = '1; // -1, 0, 1, ..., A_DEPTH-1
334353
uwire avld = !APtr[$left(APtr)];
335354

336-
logic [PE-1:0][N-1:0] BDat = 'x;
355+
logic [PE-1:0][M-1:0] BDat = 'x;
337356
logic BVld = 0;
338357

339-
uwire aload = pipe[0][N].op ==? TH;
358+
uwire aload = pipe[0][M].op ==? TH;
340359
uwire bload = !BVld || ordy;
341360

342361
always_ff @(posedge clk) begin
343362
if(aload) begin
344363
assert(APtr < $signed(A_DEPTH-1)) else begin
345364
$error("Overrun after failing stream guard.");
346365
end
347-
foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][N].ptr;
366+
foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][M].ptr;
348367
for(int unsigned i = 1; i < A_DEPTH; i++) ADat[i] <= ADat[i-1];
349368
end
350369
end

finn-rtllib/thresholding/hdl/thresholding_axi.sv

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@
3939
*****************************************************************************/
4040

4141
module thresholding_axi #(
42-
int unsigned N, // output precision
4342
int unsigned WI, // input precision
4443
int unsigned WT, // threshold precision
44+
int unsigned N, // number of thresholds
4545
int unsigned C = 1, // Channels
4646
int unsigned PE = 1, // Processing Parallelism, requires C = k*PE
4747

@@ -51,7 +51,6 @@ module thresholding_axi #(
5151

5252
// Initial Thresholds
5353
parameter THRESHOLDS_PATH = "",
54-
5554
bit USE_AXILITE, // Implement AXI-Lite for threshold read/write
5655

5756
// Force Use of On-Chip Memory Blocks
@@ -60,10 +59,10 @@ module thresholding_axi #(
6059
bit DEEP_PIPELINE = 0,
6160

6261
localparam int unsigned CF = C/PE, // Channel Fold
63-
localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2,
62+
localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + $clog2(N) + 2,
6463
localparam int unsigned O_BITS = BIAS >= 0?
65-
/* unsigned */ $clog2(2**N+BIAS) :
66-
/* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
64+
/* unsigned */ $clog2(N+BIAS+1) :
65+
/* signed */ 1+$clog2(-BIAS >= N+BIAS+1? -BIAS : N+BIAS+1)
6766
)(
6867
//- Global Control ------------------
6968
input logic ap_clk,
@@ -116,7 +115,7 @@ module thresholding_axi #(
116115

117116
if(USE_AXILITE) begin
118117
uwire [ADDR_BITS-1:0] cfg_a0;
119-
axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(WT)) axi (
118+
axilite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(WT)) axi (
120119
.aclk(ap_clk), .aresetn(ap_rst_n),
121120

122121
.awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x),
@@ -178,7 +177,7 @@ module thresholding_axi #(
178177
//-----------------------------------------------------------------------
179178
// Kernel Implementation
180179
thresholding #(
181-
.N(N), .K(WT), .C(C), .PE(PE),
180+
.K(WT), .N(N), .C(C), .PE(PE),
182181
.SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS),
183182
.THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE),
184183
.DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM),

finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@
3333
*/
3434

3535
module $MODULE_NAME_AXI_WRAPPER$ #(
36-
parameter N = $N$, // output precision
3736
parameter WI = $WI$, // input precision
3837
parameter WT = $WT$, // threshold precision
38+
parameter N = $N$, // number of thresholds
3939
parameter C = $C$, // Channels
4040
parameter PE = $PE$, // Processing Parallelism, requires C = k*PE
4141

@@ -64,7 +64,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
6464
// Writing
6565
input s_axilite_AWVALID,
6666
output s_axilite_AWREADY,
67-
input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored
67+
input [$clog2(C/PE) + $clog2(PE) + $clog2(N) + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored
6868

6969
input s_axilite_WVALID,
7070
output s_axilite_WREADY,
@@ -78,7 +78,7 @@ module $MODULE_NAME_AXI_WRAPPER$ #(
7878
// Reading
7979
input s_axilite_ARVALID,
8080
output s_axilite_ARREADY,
81-
input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_ARADDR,
81+
input [$clog2(C/PE) + $clog2(PE) + $clog2(N) + 1:0] s_axilite_ARADDR,
8282

8383
output s_axilite_RVALID,
8484
input s_axilite_RREADY,

0 commit comments

Comments
 (0)