Skip to content

Commit 4c5185e

Browse files
committed
Merge branch 'dev' into custom/transformer
2 parents c904725 + f17ff8e commit 4c5185e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+1584
-214
lines changed

.readthedocs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ version: 2
3636
build:
3737
os: ubuntu-22.04
3838
tools:
39-
python: "3.8"
39+
python: "3.10"
4040

4141
sphinx:
4242
configuration: docs/finn/conf.py

docker/finn_entrypoint.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ else
155155
echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts"
156156
fi
157157

158-
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$VITIS_PATH/lnx64/tools/fpo_v7_1"
158+
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$VITIS_PATH/lnx64/tools/fpo_v7_1:$HLS_PATH/lnx64/tools/fpo_v7_1"
159159

160160
export PATH=$PATH:$HOME/.local/bin
161161

docker/jenkins/Jenkinsfile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,17 @@ pipeline {
66
booleanParam(name: 'end2end', defaultValue: false, description: 'Run end2end tests')
77
}
88
stages {
9+
stage('Prune docker') {
10+
agent {
11+
label 'finn-build'
12+
}
13+
steps {
14+
script {
15+
// Clean up Docker state on the agent selected above before the 'Run Tests' stage starts.
// Note this is broader than "old containers": per the Docker CLI reference,
// `docker system prune` removes stopped containers, networks not used by any
// container and build cache; `-a` extends image removal from dangling images to
// ALL images not used by a container, and `-f` skips the confirmation prompt.
// NOTE(review): this only cleans the single agent that the 'finn-build' label
// resolves to for this stage — confirm that is the intended scope.
16+
sh "docker system prune -a -f"
17+
}
18+
}
19+
}
920
stage('Run Tests') {
1021
parallel {
1122
stage('Sanity - Build Hardware') {

fetch-repos.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ QONNX_COMMIT="9036148104d9a5f731dfd7abc794184bd1671d9f"
3232
FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
3333
BREVITAS_COMMIT="9421f5670f990a0250128cc6285b3901b4f9ef94"
3434
CNPY_COMMIT="8c82362372ce600bbd1cf11d64661ab69d38d7de"
35-
HLSLIB_COMMIT="a19482ba6886f6f26aff11b10126a82ce0dd7ab1"
35+
HLSLIB_COMMIT="bf7b6ce54b0cf8a521c526ae81ddbd2bb64dbd3e"
3636
OMX_COMMIT="a5d48f93309b235fdd21556d16e86e6ef5db6e2e"
3737
AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
3838
XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/****************************************************************************
2+
* Copyright (C) 2025, Advanced Micro Devices, Inc.
3+
* All rights reserved.
4+
*
5+
* SPDX-License-Identifier: BSD-3-Clause
6+
*
7+
* @brief BRAM with two output registers and streaming interface with backpressure.
8+
* @author Shane T. Fleming
9+
*
10+
* @description
11+
* This module implements a simple BRAM/URAM wrapper with two output
12+
* registers to allow Vivado to fuse them into the BRAM for better timing.
13+
* The read side features streaming interfaces with proper backpressure
14+
* handling via an integrated skid buffer.
15+
*
16+
* The pipeline consists of:
17+
* Address -> BRAM -> Dout1 -> Dout2 -> skid buffer -> output
18+
*
19+
* Backpressure is handled by the skid buffer, which can absorb data
20+
* during temporary downstream stalls while maintaining data ordering.
21+
***************************************************************************/
22+
23+
module elasticmem #(
24+
int unsigned WIDTH,  // Bit width of one memory word (wr_data, rd_dat and every Mem entry)
25+
int unsigned DEPTH,  // Number of words in Mem; both address ports are $clog2(DEPTH) bits wide
26+
int unsigned FEED_STAGES = 0,  // Forwarded unchanged to the FEED_STAGES parameter of the skid instance; its meaning is defined by skid
27+
parameter RAM_STYLE = "auto"  // Value used for the ram_style attribute attached to Mem
28+
)(
29+
input logic clk,
30+
input logic rst,  // Synchronous reset: only evaluated inside the posedge-clk processes and passed on to skid
31+
32+
// Write port (simple, no handshake)
// Mem[wr_addr] takes wr_data on every rising clk edge on which wr_en is high.
// The write process does not look at rst, so memory contents are not cleared by reset.
33+
input logic [WIDTH-1:0] wr_data,
34+
// NOTE(review): for DEPTH == 1, $clog2(DEPTH) is 0 and this range becomes [-1:0];
// confirm that every instantiation uses DEPTH >= 2.
input logic [$clog2(DEPTH)-1:0] wr_addr,
35+
input logic wr_en,
36+
37+
// Read request channel (address)
// rd_addr and rd_req_vld are captured into stage 0 in every cycle in which
// rd_req_rdy is high; rd_req_vld marks whether the captured address is a real request.
38+
input logic [$clog2(DEPTH)-1:0] rd_addr,
39+
input logic rd_req_vld,
40+
output logic rd_req_rdy,
41+
42+
// Read data channel (downstream)
// These three signals are wired directly to the output side of the skid instance.
43+
output logic [WIDTH-1:0] rd_dat,
44+
output logic rd_dat_vld,
45+
input logic rd_dat_rdy
46+
);
47+
48+
//-----------------------------------------------------------------------
49+
// BRAM and Pipeline Stages
50+
(* ram_style = RAM_STYLE *)
51+
logic [WIDTH-1:0] Mem [DEPTH-1:0];
52+
53+
// Pipeline registers
// Each stage is a data register plus a valid flag. The declaration initializers
// start every valid flag at 0 and leave the data registers as 'x.
54+
logic [$clog2(DEPTH)-1:0] AddrReg = 'x;
55+
logic AddrVld = 0;
56+
logic [WIDTH-1:0] Dout1 = 'x;
57+
logic Dout1Vld = 0;
58+
logic [WIDTH-1:0] Dout2 = 'x;
59+
logic Dout2Vld = 0;
60+
61+
//-----------------------------------------------------------------------
62+
// Write Port
// Unconditional of the read pipeline state: a write happens whenever wr_en is high.
63+
always_ff @(posedge clk) begin
64+
if(wr_en) begin
65+
Mem[wr_addr] <= wr_data;
66+
end
67+
end
68+
69+
//-----------------------------------------------------------------------
70+
// Pipeline Control Logic
// skid_irdy is driven by the .irdy port of the skid instance at the end of the pipeline.
71+
logic skid_irdy;
72+
73+
// A stage may load a new value when it currently holds nothing valid, or when the
// stage after it is advancing in the same cycle. The conditions chain
// combinationally from skid_irdy back to rd_req_rdy.
uwire stage2_advance = !Dout2Vld || skid_irdy;
74+
uwire stage1_advance = !Dout1Vld || stage2_advance;
75+
uwire stage0_advance = !AddrVld || stage1_advance;
76+
assign rd_req_rdy = stage0_advance;
77+
78+
//-----------------------------------------------------------------------
79+
// Stage 0: Address Register
// Reset clears the valid flag (the address itself becomes 'x). Otherwise the stage
// loads rd_addr together with rd_req_vld whenever it may advance; the address is
// loaded even when rd_req_vld is low, in which case AddrVld marks the entry as empty.
80+
always_ff @(posedge clk) begin
81+
if(rst) begin
82+
AddrReg <= 'x;
83+
AddrVld <= 0;
84+
end
85+
else if(stage0_advance) begin
86+
AddrReg <= rd_addr;
87+
AddrVld <= rd_req_vld;
88+
end
89+
end
90+
91+
//-----------------------------------------------------------------------
92+
// Stage 1: First Memory Output Register (BRAM output)
// Registered read of Mem at the address held in stage 0; Dout1Vld follows AddrVld.
93+
always_ff @(posedge clk) begin
94+
if(rst) begin
95+
Dout1 <= 'x;
96+
Dout1Vld <= 0;
97+
end
98+
else if(stage1_advance) begin
99+
Dout1Vld <= AddrVld;
100+
Dout1 <= Mem[AddrReg];
101+
end
102+
end
103+
104+
//-----------------------------------------------------------------------
105+
// Stage 2: Second Output Register (candidate for BRAM fusion)
// Plain copy of stage 1 (data and valid flag) whenever stage 2 may advance.
106+
always_ff @(posedge clk) begin
107+
if(rst) begin
108+
Dout2 <= 'x;
109+
Dout2Vld <= 0;
110+
end
111+
else if(stage2_advance) begin
112+
Dout2Vld <= Dout1Vld;
113+
Dout2 <= Dout1;
114+
end
115+
end
116+
117+
//-----------------------------------------------------------------------
118+
// Skid Buffer for Backpressure Handling
// The skid module is defined outside this file section. Stage 2 feeds its input
// side (idat/ivld, with irdy returned as skid_irdy) and its output side drives the
// module's read data channel directly.
// NOTE(review): the buffering depth and latency added by skid, and the effect of
// FEED_STAGES, are not visible here — check the skid module.
119+
skid #(
120+
.DATA_WIDTH (WIDTH),
121+
.FEED_STAGES(FEED_STAGES)
122+
) u_skid (
123+
.clk (clk),
124+
.rst (rst),
125+
126+
.idat (Dout2),
127+
.ivld (Dout2Vld),
128+
.irdy (skid_irdy),
129+
130+
.odat (rd_dat),
131+
.ovld (rd_dat_vld),
132+
.ordy (rd_dat_rdy)
133+
);
134+
135+
endmodule : elasticmem

finn-rtllib/inner_shuffle/inner_shuffle.sv

Lines changed: 53 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -56,34 +56,6 @@
5656
* enabling both the write path and the read path to be active simultaneously.
5757
****************************************************************************/
5858

59-
// A memory bank in the inner_shuffle design. Pattern was kept as simple
60-
// as possible to help with Vivado BRAM inference.
61-
module mem_bank #(
62-
int unsigned WIDTH,
63-
int unsigned DEPTH,
64-
parameter RAM_STYLE = "auto"
65-
)(
66-
input logic clk,
67-
68-
input logic [WIDTH-1:0] d_in,
69-
input logic [$clog2(DEPTH)-1:0] wr_addr,
70-
input logic wr_en,
71-
72-
output logic [WIDTH-1:0] d_out,
73-
input logic [$clog2(DEPTH)-1:0] rd_addr,
74-
input logic rd_en
75-
);
76-
77-
(* ram_style=RAM_STYLE *)
78-
logic [WIDTH-1:0] Mem [DEPTH-1:0]; // The Mem for this bank
79-
always_ff @(posedge clk) begin
80-
if(wr_en) Mem[wr_addr] <= d_in;
81-
if(rd_en) d_out <= Mem[rd_addr];
82-
end
83-
84-
endmodule : mem_bank
85-
86-
8759
// ----------------------------------------
8860
// Parallel Transpose Unit (InnerShuffle)
8961
// ----------------------------------------
@@ -124,23 +96,37 @@ module inner_shuffle #(
12496
localparam int unsigned PAGE_OFFSET = I*J / SIMD;
12597
uwire wr_en = irdy && ivld;
12698
uwire [$clog2(BANK_DEPTH)-1:0] wr_addr;
127-
uwire rd_en;
12899
uwire [BITS-1:0] d_in [SIMD-1:0];
129100
uwire [BITS-1:0] d_out [SIMD-1:0];
101+
uwire rd_req_en;
102+
uwire [SIMD-1:0] rd_req_rdy;
103+
uwire rd_req_rdy_all = &rd_req_rdy;
130104
logic [$clog2(BANK_DEPTH)-1:0] raddr [SIMD-1:0];
105+
uwire rd_dat_vld [SIMD-1:0];
106+
uwire rd_dat_rdy;
107+
assign rd_dat_rdy = osb_rdy && rd_pattern_sb_ovld;
108+
uwire rd_pattern_sb_irdy;
109+
uwire rd_pattern_sb_ovld;
131110
for(genvar i = 0; i<SIMD; i++) begin : gen_mem_banks
132-
mem_bank #(
111+
elasticmem #(
133112
.WIDTH(BITS),
134113
.DEPTH(BANK_DEPTH),
135114
.RAM_STYLE(RAM_STYLE)
136-
) mem_bank_inst (
137-
.clk,
138-
.wr_en,
139-
.wr_addr,
140-
.d_in(d_in[i]),
141-
.rd_en,
115+
) mem_banks_inst (
116+
.clk(clk),
117+
.rst(rst),
118+
119+
.wr_data(d_in[i]),
120+
.wr_addr(wr_addr),
121+
.wr_en(wr_en),
122+
142123
.rd_addr(raddr[i]),
143-
.d_out(d_out[i])
124+
.rd_req_vld(rd_req_en),
125+
.rd_req_rdy(rd_req_rdy[i]),
126+
127+
.rd_dat(d_out[i]),
128+
.rd_dat_vld(rd_dat_vld[i]),
129+
.rd_dat_rdy(rd_dat_rdy)
144130
);
145131
end : gen_mem_banks
146132

@@ -221,6 +207,7 @@ module inner_shuffle #(
221207

222208
localparam int unsigned RD_ROT_PERIOD = I / SIMD; // (I % SIMD == 0) is a constraint
223209
typedef logic [$clog2(SIMD)-1:0] rotidx_vec_t[SIMD-1:0];
210+
typedef logic [SIMD*$clog2(SIMD)-1:0] packed_rotidx_vec_t;
224211
typedef logic [$clog2(BANK_DEPTH)-1:0] bank_addr_t [SIMD-1:0];
225212

226213
// --------------------------------------------------------------------------
@@ -279,7 +266,7 @@ module inner_shuffle #(
279266
uwire [$clog2(BANK_DEPTH)-1:0] page_rd_offset;
280267
uwire osb_rdy; // output skid buffer ready signal
281268
uwire rd_guard = !CurrentPageRd && !WrJobsDone[0] && !WrJobsDone[1];
282-
uwire rd_inc = osb_rdy & !rd_guard;
269+
uwire rd_inc = rd_req_en && rd_req_rdy_all;
283270

284271
// Counts reads across the columns
285272
logic[$clog2(I)-1 : 0] RdICnt = 0; // 0, ..., I - 1
@@ -349,9 +336,7 @@ module inner_shuffle #(
349336
// --------------------------------------------------------------------------
350337
// Page management
351338

352-
logic OsbVld = 0; // output skidbuffer valid signal
353-
logic OsbVld_D = 0; // output skidbuffer valid signal
354-
assign rd_en = osb_rdy;
339+
assign rd_req_en = !rd_guard && rd_pattern_sb_irdy; // We can read once the guard is not up
355340

356341
always_ff @(posedge clk) begin
357342
if (rst) begin
@@ -363,37 +348,26 @@ module inner_shuffle #(
363348
if (wr_addr == 2*PAGE_OFFSET - 1) WrJobsDone[1] <= 1;
364349

365350
// Clear the relevant job once it is read
366-
if (page_boundary && (osb_rdy && OsbVld)) begin
351+
if (page_boundary && (rd_req_en && rd_req_rdy_all)) begin
367352
WrJobsDone[CurrentPageRd] <= 0;
368353
CurrentPageRd <= !CurrentPageRd;
369354
end
370355
end
371356
end
372357

373-
assign page_rd_offset = CurrentPageRd ? 0: PAGE_OFFSET;
358+
assign page_rd_offset = CurrentPageRd ? 0 : PAGE_OFFSET;
374359
assign irdy = !WrJobsDone[0] || !WrJobsDone[1];
375360

376361
// Forward the current RD_PATTERN row onto the next pipeline stage
377362
rotidx_vec_t RdPat = RD_INIT_PAT;
378-
rotidx_vec_t RdPat_D = RD_INIT_PAT; // The fowarded rotation pattern
379-
always_ff @(posedge clk) begin : rd_pattern_col_forwarding
380-
if (rst) begin
381-
OsbVld <= 0;
382-
RdPat_D <= RD_INIT_PAT;
383-
end
384-
else begin
385-
OsbVld <= !rd_guard;
386-
OsbVld_D <= OsbVld;
387-
if (rd_inc) RdPat_D <= RdPat;
388-
if(OsbVld & rd_guard & !osb_rdy) OsbVld <= 1;
389-
end
390-
end : rd_pattern_col_forwarding
363+
packed_rotidx_vec_t rd_pat_forwarded_packed;
364+
rotidx_vec_t rd_pat_forwarded;
391365

392366
// Structural remapping using the output of the memory banks
393367
// and the Read rotation from the previous clock cycle that was
394368
// used to generate the read addresses.
395369
uwire [SIMD-1:0][BITS-1:0] remapped_data; // remapped output
396-
for(genvar i=0; i<SIMD; i++) assign remapped_data[i] = d_out[RdPat_D[i]];
370+
for(genvar i=0; i<SIMD; i++) assign remapped_data[i] = d_out[rd_pat_forwarded[i]];
397371

398372
// the next permutation of the rd pattern
399373
rotidx_vec_t rd_pat_next;
@@ -433,11 +407,32 @@ module inner_shuffle #(
433407
end
434408
end
435409
end : rd_pattern_assignment
410+
411+
412+
//=======================================================================
413+
skid #(
414+
.DATA_WIDTH($bits(packed_rotidx_vec_t))
415+
)
416+
rd_pattern_skid (
417+
.clk(clk),
418+
.rst(rst),
419+
420+
.idat(packed_rotidx_vec_t'(RdPat)),
421+
.ivld(rd_req_en),
422+
.irdy(rd_pattern_sb_irdy),
423+
424+
.odat(rd_pat_forwarded_packed),
425+
.ovld(rd_pattern_sb_ovld),
426+
.ordy(osb_rdy && rd_dat_vld[0])
427+
);
428+
429+
assign rd_pat_forwarded = rotidx_vec_t'(rd_pat_forwarded_packed);
436430
// --------------------------------------------------------------------------
437431

438432
//=======================================================================
439433
// Output SkidBuffer -- Used to decouple control signals for timing
440434
// improvements
435+
uwire osb_vld = rd_pattern_sb_ovld && rd_dat_vld[0];
441436
skid #(
442437
.DATA_WIDTH(SIMD*BITS)
443438
)
@@ -446,7 +441,7 @@ module inner_shuffle #(
446441
.rst(rst),
447442

448443
.idat(remapped_data),
449-
.ivld(OsbVld),
444+
.ivld(osb_vld),
450445
.irdy(osb_rdy),
451446

452447
.odat(odat),

0 commit comments

Comments
 (0)