Skip to content

Commit 95a287f

Browse files
authored
Integrate Instrumentation into Zynq Backend (#45)
* Integrates instrumentation wrapper as alternative to IODMA shell for ZynqBuild, controlled via enable_instrumentation option
1 parent 2a3db61 commit 95a287f

16 files changed

Lines changed: 1219 additions & 43 deletions

.gitlab-ci.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,11 @@ Sync finn-dev:
9393

9494
.setup_venv_from_whl: &setup_venv_from_whl
9595
# Move everything to working directory (e.g., RAMdisk)
96-
- cp -dfR .. $PATH_WORKDIR
96+
- cp -dfR . $PATH_WORKDIR
9797
- cd $PATH_WORKDIR
9898
# Create fresh virtual environment and install finn-plus from .whl (artifact)
9999
- python3 -m venv finn-plus-venv
100-
- finn-plus-venv/bin/pip install ./finn-plus/dist/*.whl
100+
- finn-plus-venv/bin/pip install dist/*.whl
101101

102102
Build:
103103
id_tokens:
@@ -171,8 +171,8 @@ FINN Test Suite 2022.2:
171171
- $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log &
172172
# Launch FINN via test command, includes preparation of (cached) dependencies
173173
- |
174-
source ./finn-plus-venv/bin/activate
175-
finn test --variant $TEST_SUITE --dependency-path ./finn-plus/deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL
174+
source finn-plus-venv/bin/activate
175+
finn test --variant $TEST_SUITE --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL
176176
artifacts:
177177
name: "test_reports"
178178
when: always
Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
/******************************************************************************
2+
* Copyright (c) 2023, Xilinx, Inc.
3+
* All rights reserved.
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions are met:
7+
*
8+
* 1. Redistributions of source code must retain the above copyright notice,
9+
* this list of conditions and the following disclaimer.
10+
*
11+
* 2. Redistributions in binary form must reproduce the above copyright
12+
* notice, this list of conditions and the following disclaimer in the
13+
* documentation and/or other materials provided with the distribution.
14+
*
15+
* 3. Neither the name of the copyright holder nor the names of its
16+
* contributors may be used to endorse or promote products derived from
17+
* this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21+
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22+
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
23+
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26+
* OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27+
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28+
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29+
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30+
*******************************************************************************
31+
* @brief Instrumentation wrapper module for FINN IP characterization.
32+
* @author Thomas B. Preusser <thomas.preusser@amd.com>
33+
* @details
34+
* Instrumentation wrapper intercepting the feature map input to and
35+
* the feature map output from a FINN IP to measure processing latency and
36+
* initiation interval in terms of clock cycles. The most recent readings
37+
* are exposed via AXI-Lite.
38+
* This wrapper can run the FINN IP detached from an external data source
39+
* and sink by feeding LFSR-generated data and sinking the output without
40+
* backpressure.
41+
* This module is currently not integrated with the FINN compiler. It must
42+
* be instantiated and integrated with the rest of the system in a manual
43+
* process.
44+
*
45+
* @param PENDING maximum number of feature maps in the FINN dataflow pipeline
46+
* @param ILEN number of input transactions per IFM
47+
* @param OLEN number of output transactions per OFM
48+
* @param KO number of subwords within output payload vector
49+
* @param TI type of input payload vector
50+
* @param TO type of output payload vector
51+
*******************************************************************************/
52+
53+
#include <hls_stream.h>
54+
#include <ap_int.h>
55+
#include <ap_axi_sdata.h>
56+
#include <algorithm>
57+
58+
// Module Configuration
59+
// NOTE: the @NAME@ tokens below are not valid C++ as written; they are
// placeholders that have to be substituted with concrete values / types
// before this file can be compiled.
// These file-scope settings are the ones used by instrumentation_wrapper().
constexpr unsigned PENDING = @PENDING@; // Max. feature maps in flight
60+
constexpr unsigned ILEN = @ILEN@; // Input words per IFM
61+
constexpr unsigned OLEN = @OLEN@; // Output words per OFM
62+
constexpr unsigned KO = @KO@; // Subwords within OFM transaction word
63+
using TI = @TI@; // IFM transaction word
64+
using TO = @TO@; // OFM transaction word
65+
66+
//---------------------------------------------------------------------------
67+
// Utility Functions
68+
/**
 * Ceiling of the base-2 logarithm: the smallest n such that 2^n >= x.
 * Yields 0 for x < 2.
 *
 * The rounding-up halving step is written as x/2 + x%2 rather than (x+1)/2
 * so that the largest unsigned value does not wrap around to zero.
 */
static constexpr unsigned clog2(unsigned x) { return x < 2? 0 : 1 + clog2(x/2 + x%2); }
69+
// Same as clog2() but never less than 1, so the result is always usable as a
// bit width (e.g. for a counter that only ever holds the value 0).
static constexpr unsigned clog2nz(unsigned x) { return clog2(x) < 1u? 1u : clog2(x); }
70+
71+
template<typename T>
72+
static void move(
73+
hls::stream<T> &src,
74+
hls::stream<T> &dst
75+
) {
76+
#pragma HLS pipeline II=1 style=flp
77+
dst.write(src.read());
78+
}
79+
80+
template<typename T>
81+
static void move(
82+
hls::stream<hls::axis<T, 0, 0, 0>> &src,
83+
hls::stream<T> &dst
84+
) {
85+
#pragma HLS pipeline II=1 style=flp
86+
dst.write(src.read().data);
87+
}
88+
89+
/**
 * Type trait naming the payload type carried by a stream element type.
 * For an arbitrary T, the payload is T itself.
 */
template<typename T>
struct Payload {
	using type = T;
};
94+
template<typename T>
95+
class Payload<hls::axis<T, 0, 0, 0>> {
96+
public:
97+
using type = T;
98+
};
99+
100+
/**
101+
* Computes a checksum over a forwarded stream assumed to carry frames of
102+
* N words further subdivided into K subwords.
103+
* - Subword slicing can be customized typically by using a lambda.
104+
* The provided DefaultSubwordSlicer assumes an `ap_(u)int`-like word
105+
* type with a member `width` and a range-based slicing operator. It
106+
* further assumes a little-endian arrangement of subwords within words
107+
* for the canonical subword stream order.
108+
* - Subwords wider than 23 bits are folded using bitwise XOR across
109+
* slices of 23 bits starting from the LSB.
110+
* - The folded subword values are weighted according to their position
111+
* in the stream relative to the start of frame by a periodic weight
112+
* sequence 1, 2, 3, ...
113+
* - The weighted folded subword values are reduced to a checksum by an
114+
* accumulation modulo 2^24.
115+
* - A checksum is emitted for each completed frame. It is the concatenation
116+
* of an 8-bit (modulo 256) frame counter and the 24-bit frame checksum.
117+
*/
118+
template<typename T, unsigned K>
119+
class DefaultSubwordSlicer {
120+
static_assert(T::width%K == 0, "Word size must be subword multiple.");
121+
static constexpr unsigned W = T::width/K;
122+
public:
123+
ap_uint<W> operator()(T const &x, unsigned const j) const {
124+
#pragma HLS inline
125+
return x((j+1)*W-1, j*W);
126+
}
127+
};
128+
129+
//---------------------------------------------------------------------------
130+
// Instrumentation Core
131+
template<
132+
unsigned PENDING,
133+
unsigned ILEN,
134+
unsigned OLEN,
135+
unsigned KO,
136+
typename TI,
137+
typename TO
138+
>
139+
void instrument(
140+
hls::stream<TI> &finnix,
141+
hls::stream<TO> &finnox,
142+
ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed
143+
ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow
144+
ap_uint<32> &latency,
145+
ap_uint<32> &interval,
146+
ap_uint<32> &checksum,
147+
ap_uint<32> &min_latency
148+
) {
149+
#pragma HLS pipeline II=1 style=flp
150+
151+
// Timestamp Management State
152+
using clock_t = ap_uint<32>;
153+
static clock_t cnt_clk = 0;
154+
#pragma HLS reset variable=cnt_clk
155+
hls::stream<clock_t> timestamps;
156+
#pragma HLS stream variable=timestamps depth=PENDING
157+
static bool timestamp_ovf = false;
158+
static bool timestamp_unf = false;
159+
#pragma HLS reset variable=timestamp_ovf
160+
#pragma HLS reset variable=timestamp_unf
161+
162+
// Input Feed & Generation
163+
constexpr unsigned LFSR_WIDTH = (TI::width+15)/16 * 16;
164+
static ap_uint<clog2nz(ILEN)> icnt = 0;
165+
static ap_uint<LFSR_WIDTH> lfsr;
166+
#pragma HLS reset variable=icnt
167+
#pragma HLS reset variable=lfsr off
168+
if(!finnix.full()) {
169+
170+
bool const first = icnt == 0;
171+
bool wr;
172+
if(first) {
173+
// Start of new feature map
174+
wr = cfg[0];
175+
for(unsigned i = 0; i < LFSR_WIDTH; i += 16) {
176+
#pragma HLS unroll
177+
lfsr(15+i, i) = cfg(31, 16) ^ (i>>4)*33331;
178+
}
179+
}
180+
else {
181+
// Advance LFSR
182+
wr = true;
183+
for(unsigned i = 0; i < LFSR_WIDTH; i += 16) {
184+
#pragma HLS unroll
185+
lfsr(15+i, i) = (lfsr(15+i, i) >> 1) ^ ap_uint<16>(lfsr[i]? 0 : 0x8805);
186+
}
187+
}
188+
189+
if(wr) {
190+
finnix.write_nb(lfsr);
191+
if(first) timestamp_ovf |= !timestamps.write_nb(cnt_clk);
192+
icnt = icnt == ILEN-1? decltype(icnt)(0) : decltype(icnt)(icnt + 1);
193+
}
194+
}
195+
196+
// Output Tracking
197+
static ap_uint<clog2nz(OLEN)> ocnt = 0;
198+
#pragma HLS reset variable=ocnt
199+
static clock_t ts1 = 0; // last output timestamp
200+
static clock_t last_latency = 0;
201+
static clock_t last_interval = 0;
202+
static clock_t cur_min_latency = ~0;
203+
#pragma HLS reset variable=ts1
204+
#pragma HLS reset variable=last_latency
205+
#pragma HLS reset variable=last_interval
206+
#pragma HLS reset variable=cur_min_latency
207+
208+
static ap_uint<8> pkts = 0;
209+
#pragma HLS reset variable=pkts
210+
static ap_uint< 2> coeff[3];
211+
static ap_uint<24> psum;
212+
static ap_uint<32> last_checksum = 0;
213+
#pragma HLS reset variable=coeff off
214+
#pragma HLS reset variable=psum off
215+
#pragma HLS reset variable=last_checksum
216+
217+
TO oval;
218+
if(finnox.read_nb(oval)) {
219+
// Start of new output feature map
220+
if(ocnt == 0) {
221+
for(unsigned i = 0; i < 3; i++) coeff[i] = i+1;
222+
psum = 0;
223+
}
224+
225+
// Update checksum
226+
for(unsigned j = 0; j < KO; j++) {
227+
#pragma HLS unroll
228+
auto const v0 = DefaultSubwordSlicer<TO, KO>()(oval, j);
229+
constexpr unsigned W = 1 + (decltype(v0)::width-1)/23;
230+
ap_uint<KO*23> v = v0;
231+
ap_uint< 23> w = 0;
232+
for(unsigned k = 0; k < W; k++) w ^= v(23*k+22, 23*k);
233+
psum += (coeff[j%3][1]? (w, ap_uint<1>(0)) : ap_uint<24>(0)) + (coeff[j%3][0]? w : ap_uint<23>(0));
234+
}
235+
236+
// Re-align coefficients
237+
for(unsigned j = 0; j < 3; j++) {
238+
#pragma HLS unroll
239+
ap_uint<3> const cc = coeff[j] + ap_uint<3>(KO%3);
240+
coeff[j] = cc(1, 0) + cc[2];
241+
}
242+
243+
// Track frame position
244+
if(ocnt != OLEN-1) ocnt++;
245+
else {
246+
clock_t ts0;
247+
if(!timestamps.read_nb(ts0)) timestamp_unf = true;
248+
else {
249+
last_latency = cnt_clk - ts0; // completion - start
250+
last_interval = cnt_clk - ts1; // completion - previous completion
251+
cur_min_latency = std::min(cur_min_latency, last_latency);
252+
ts1 = cnt_clk; // mark completion ^
253+
}
254+
ocnt = 0;
255+
256+
last_checksum = (pkts++, psum);
257+
}
258+
}
259+
260+
// Advance Timestamp Counter
261+
cnt_clk++;
262+
263+
// Copy Status Outputs
264+
status = timestamp_ovf | (timestamp_unf << 1);
265+
latency = last_latency;
266+
interval = last_interval;
267+
checksum = last_checksum;
268+
min_latency = cur_min_latency;
269+
270+
} // instrument()
271+
272+
/**
 * Top-level function of the instrumentation wrapper.
 *
 * Instantiates instrument() with the file-scope configuration (PENDING,
 * ILEN, OLEN, KO, TI, TO) and connects it to the external ports through two
 * small FIFOs. The three calls below form a dataflow region:
 *  1. move(finnox, finnox0)  - external output stream -> internal FIFO
 *  2. instrument(...)        - data generation and measurement
 *  3. move(finnix0, finnix)  - internal FIFO -> external input stream
 *
 * @param finnix       AXI-Stream carrying generated input data
 * @param finnox       AXI-Stream carrying the output data to be observed
 * @param cfg          configuration word passed on to instrument()
 * @param status       status flags reported by instrument()
 * @param latency      latency reading reported by instrument()
 * @param interval     interval reading reported by instrument()
 * @param checksum     checksum reading reported by instrument()
 * @param min_latency  minimum latency reading reported by instrument()
 */
void instrumentation_wrapper(
273+
hls::stream<TI> &finnix,
274+
hls::stream<TO> &finnox,
275+
ap_uint<32> cfg,
276+
ap_uint<32> &status,
277+
ap_uint<32> &latency,
278+
ap_uint<32> &interval,
279+
ap_uint<32> &checksum,
280+
ap_uint<32> &min_latency
281+
) {
282+
// Both data ports are AXI-Stream interfaces.
#pragma HLS interface axis port=finnix
283+
#pragma HLS interface axis port=finnox
284+
// Configuration and all readings share the single AXI-Lite bundle `ctrl`.
#pragma HLS interface s_axilite bundle=ctrl port=cfg
285+
#pragma HLS interface s_axilite bundle=ctrl port=status
286+
#pragma HLS interface s_axilite bundle=ctrl port=latency
287+
#pragma HLS interface s_axilite bundle=ctrl port=interval
288+
#pragma HLS interface s_axilite bundle=ctrl port=checksum
289+
#pragma HLS interface s_axilite bundle=ctrl port=min_latency
290+
// No block-level control protocol on the function itself.
#pragma HLS interface ap_ctrl_none port=return
291+
292+
#pragma HLS dataflow disable_start_propagation
293+
// Internal FIFOs (depth 2) between the external ports and the core.
static hls::stream<TI> finnix0;
294+
// Payload<TO>::type is TO itself, or the wrapped data type if TO is an
// hls::axis<T, 0, 0, 0> transaction.
static hls::stream<Payload<TO>::type> finnox0;
295+
#pragma HLS stream variable=finnix0 depth=2
296+
#pragma HLS stream variable=finnox0 depth=2
297+
298+
// AXI-Stream -> FIFO
299+
move(finnox, finnox0);
300+
301+
// Main
302+
// TI and TO of instrument() are deduced from the FIFO element types.
instrument<PENDING, ILEN, OLEN, KO>(finnix0, finnox0, cfg, status, latency, interval, checksum, min_latency);
303+
304+
// FIFO -> AXI-Stream
305+
move(finnix0, finnix);
306+
307+
} // instrumentation_wrapper

0 commit comments

Comments
 (0)