Skip to content

Commit 9b1ef51

Browse files
authored
Merge pull request #212 from eki-project/feature/instrumentation_avg
[Instrumentation] Measure avg. performance
2 parents d65f70d + e5e8a13 commit 9b1ef51

6 files changed

Lines changed: 110 additions & 8 deletions

File tree

custom_hls/instrumentation.template.cpp

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
constexpr unsigned ILEN = @ILEN@; // Input words per IFM
6161
constexpr unsigned OLEN = @OLEN@; // Output words per OFM
6262
constexpr unsigned KO = @KO@; // Subwords within OFM transaction word
63+
constexpr unsigned AVG_N = @AVG_N@; // Max frames in averaging window
6364
using TI = @TI@; // IFM transaction word
6465
using TO = @TO@; // OFM transaction word
6566

@@ -133,6 +134,7 @@
133134
unsigned ILEN,
134135
unsigned OLEN,
135136
unsigned KO,
137+
unsigned AVG_N,
136138
typename TI,
137139
typename TO
138140
>
@@ -141,11 +143,14 @@
141143
hls::stream<TO> &finnox,
142144
ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:1] - minimum interval (cycles) between IFM starts
143145
ap_uint<32> seed, // [31:16] - LFSR seed (only upper 16 bits used)
146+
ap_uint<32> avg_n, // [31:0] - averaging window size (1..AVG_N frames)
144147
ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow
145148
ap_uint<32> &latency,
146149
ap_uint<32> &interval,
147150
ap_uint<32> &checksum,
148-
ap_uint<32> &min_latency
151+
ap_uint<32> &min_latency,
152+
ap_uint<32> &avg_latency,
153+
ap_uint<32> &avg_interval
149154
) {
150155
#pragma HLS pipeline II=1 style=flp
151156

@@ -219,6 +224,26 @@
219224
#pragma HLS reset variable=last_interval
220225
#pragma HLS reset variable=cur_min_latency
221226

227+
// Sliding-Window Averaging State
228+
static ap_uint<clog2nz(AVG_N)> avg_head = 0; // write pointer in circular buffer
229+
static ap_uint<clog2nz(AVG_N+1)> avg_fill = 0; // number of valid entries (0..AVG_N)
230+
static clock_t lat_buf[AVG_N];
231+
static clock_t int_buf[AVG_N];
232+
static ap_uint<64> lat_sum = 0;
233+
static ap_uint<64> int_sum = 0;
234+
static clock_t last_avg_latency = 0;
235+
static clock_t last_avg_interval = 0;
236+
static ap_uint<32> prev_avg_n = 0;
237+
#pragma HLS reset variable=avg_head
238+
#pragma HLS reset variable=avg_fill
239+
#pragma HLS reset variable=lat_buf off
240+
#pragma HLS reset variable=int_buf off
241+
#pragma HLS reset variable=lat_sum
242+
#pragma HLS reset variable=int_sum
243+
#pragma HLS reset variable=last_avg_latency
244+
#pragma HLS reset variable=last_avg_interval
245+
#pragma HLS reset variable=prev_avg_n
246+
222247
static ap_uint<8> pkts = 0;
223248
#pragma HLS reset variable=pkts
224249
static ap_uint< 2> coeff[3];
@@ -264,6 +289,33 @@
264289
last_interval = cnt_clk - ts1; // completion - previous completion
265290
cur_min_latency = std::min(cur_min_latency, last_latency);
266291
ts1 = cnt_clk; // mark completion ^
292+
293+
// Sliding-window average update
294+
// TODO: II=1 but depth is ~70 cycles, can we optimize this?
295+
ap_uint<32> win = (avg_n == 0 || avg_n > AVG_N) ? ap_uint<32>(AVG_N) : avg_n;
296+
if(prev_avg_n != win) {
297+
avg_head = 0;
298+
avg_fill = 0;
299+
lat_sum = 0;
300+
int_sum = 0;
301+
prev_avg_n = win;
302+
}
303+
clock_t old_lat = lat_buf[avg_head];
304+
clock_t old_int = int_buf[avg_head];
305+
lat_buf[avg_head] = last_latency;
306+
int_buf[avg_head] = last_interval;
307+
if(avg_fill < win) {
308+
lat_sum += last_latency;
309+
int_sum += last_interval;
310+
avg_fill++;
311+
} else {
312+
lat_sum = lat_sum + last_latency - old_lat;
313+
int_sum = int_sum + last_interval - old_int;
314+
}
315+
avg_head++;
316+
if(avg_head >= ap_uint<clog2nz(AVG_N)+1>(win)) avg_head = 0;
317+
last_avg_latency = lat_sum / avg_fill;
318+
last_avg_interval = int_sum / avg_fill;
267319
}
268320
ocnt = 0;
269321

@@ -279,7 +331,9 @@
279331
latency = last_latency;
280332
interval = last_interval;
281333
checksum = last_checksum;
282-
min_latency = cur_min_latency;
334+
min_latency = cur_min_latency;
335+
avg_latency = last_avg_latency;
336+
avg_interval = last_avg_interval;
283337

284338
} // instrument()
285339

@@ -288,21 +342,27 @@
288342
hls::stream<TO> &finnox,
289343
ap_uint<32> cfg,
290344
ap_uint<32> seed,
345+
ap_uint<32> avg_n,
291346
ap_uint<32> &status,
292347
ap_uint<32> &latency,
293348
ap_uint<32> &interval,
294349
ap_uint<32> &checksum,
295-
ap_uint<32> &min_latency
350+
ap_uint<32> &min_latency,
351+
ap_uint<32> &avg_latency,
352+
ap_uint<32> &avg_interval
296353
) {
297354
#pragma HLS interface axis port=finnix
298355
#pragma HLS interface axis port=finnox
299356
#pragma HLS interface s_axilite bundle=ctrl port=cfg
300357
#pragma HLS interface s_axilite bundle=ctrl port=seed
358+
#pragma HLS interface s_axilite bundle=ctrl port=avg_n
301359
#pragma HLS interface s_axilite bundle=ctrl port=status
302360
#pragma HLS interface s_axilite bundle=ctrl port=latency
303361
#pragma HLS interface s_axilite bundle=ctrl port=interval
304362
#pragma HLS interface s_axilite bundle=ctrl port=checksum
305363
#pragma HLS interface s_axilite bundle=ctrl port=min_latency
364+
#pragma HLS interface s_axilite bundle=ctrl port=avg_latency
365+
#pragma HLS interface s_axilite bundle=ctrl port=avg_interval
306366
#pragma HLS interface ap_ctrl_none port=return
307367

308368
#pragma HLS dataflow disable_start_propagation
@@ -315,7 +375,7 @@
315375
move(finnox, finnox0);
316376

317377
// Main
318-
instrument<PENDING, ILEN, OLEN, KO>(finnix0, finnox0, cfg, seed, status, latency, interval, checksum, min_latency);
378+
instrument<PENDING, ILEN, OLEN, KO, AVG_N>(finnix0, finnox0, cfg, seed, avg_n, status, latency, interval, checksum, min_latency, avg_latency, avg_interval);
319379

320380
// FIFO -> AXI-Stream
321381
move(finnix0, finnix);

src/finn/builder/build_dataflow_config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,10 @@ def _fix_path(p: Path | None) -> Path | None:
538538
#: If enable_instrumentation is True, one can disable the DMA with this flag
539539
instrumentation_no_dma: Optional[bool] = False
540540

541+
#: (Only relevant if enable_instrumentation is True) Maximum size of the averaging window
542+
#: (number of frames) used by the instrumentation wrapper for average latency and interval (throughput) measurement.
543+
instrumentation_avg_n: int = 64
544+
541545
#: Whether pdb postmortem debugging will be launched when the build fails.
542546
enable_build_pdb_debug: bool = False
543547

src/finn/builder/build_dataflow_steps.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig):
15151515
cfg.enable_hw_debug,
15161516
cfg.enable_instrumentation,
15171517
cfg.instrumentation_no_dma,
1518+
cfg.instrumentation_avg_n,
15181519
cfg.live_fifo_sizing,
15191520
partition_model_dir=partition_model_dir,
15201521
)

src/finn/templates/python_driver/driver.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@ def reset_accelerator(self):
601601
offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0
602602
)
603603

604-
def start_accelerator(self, throttle_interval=0):
604+
def start_accelerator(self, throttle_interval=0, avg_window_size=64):
605605
"""
606606
Start the accelerator. Input is throttled to the specified interval (in cycles)
607607
by pausing after each FM transmission. A throttle_interval of 0 means no throttling.
@@ -610,6 +610,10 @@ def start_accelerator(self, throttle_interval=0):
610610
lfsr_seed = (self.seed << 16) & 0xFFFF0000 # upper 16 bits
611611
self.instrumentation_write("seed", lfsr_seed)
612612

613+
# Set the averaging window size (in frames). The maximum is configured in the build
614+
# config (default value = 64); the hardware uses that maximum if 0 or a larger value is written.
615+
self.instrumentation_write("avg_n", avg_window_size)
616+
613617
# Start operation
614618
self.instrumentation_write("cfg", (throttle_interval << 1) | 1) # bit 0 = start
615619

@@ -624,6 +628,8 @@ def observe_instrumentation(self, debug_print=True):
624628
min_latency = self.instrumentation_read("min_latency")
625629
latency = self.instrumentation_read("latency")
626630
interval = self.instrumentation_read("interval")
631+
avg_latency = self.instrumentation_read("avg_latency")
632+
avg_interval = self.instrumentation_read("avg_interval")
627633

628634
frame = (chksum_reg >> 24) & 0x000000FF
629635
checksum = chksum_reg & 0x00FFFFFF
@@ -643,9 +649,21 @@ def observe_instrumentation(self, debug_print=True):
643649
print("Min Latency (cycles): %d" % min_latency)
644650
print("Latency (cycles): %d" % latency)
645651
print("Interval (cycles): %d" % interval)
652+
print("Average Latency (cycles): %d" % avg_latency)
653+
print("Average Interval (cycles): %d" % avg_interval)
646654
print("----------------------------")
647655

648-
return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval)
656+
return (
657+
overflow_err,
658+
underflow_err,
659+
frame,
660+
checksum,
661+
min_latency,
662+
latency,
663+
interval,
664+
avg_latency,
665+
avg_interval,
666+
)
649667

650668
def experiment_instrumentation(self, *args, **kwargs):
651669
"""Run instrumentation experiment and save report."""
@@ -668,6 +686,8 @@ def experiment_instrumentation(self, *args, **kwargs):
668686
min_latency,
669687
latency,
670688
interval,
689+
avg_latency,
690+
avg_interval,
671691
) = self.observe_instrumentation()
672692

673693
# write report to file
@@ -677,12 +697,20 @@ def experiment_instrumentation(self, *args, **kwargs):
677697
"min_latency_cycles": min_latency,
678698
"latency_cycles": latency,
679699
"interval_cycles": interval,
700+
"avg_latency_cycles": avg_latency,
701+
"avg_interval_cycles": avg_interval,
680702
"frequency_mhz": round(self.fclk_mhz_actual),
681703
"min_latency_ms": round(min_latency * (1 / (self.fclk_mhz_actual * 1e6)) * 1e3, 6),
682704
"latency_ms": round(latency * (1 / (self.fclk_mhz_actual * 1e6)) * 1e3, 6),
705+
"avg_latency_ms": round(avg_latency * (1 / (self.fclk_mhz_actual * 1e6)) * 1e3, 6),
683706
"throughput_fps": (
684707
round(1 / (interval * (1 / (self.fclk_mhz_actual * 1e6)))) if interval != 0 else 0
685708
),
709+
"avg_throughput_fps": (
710+
round(1 / (avg_interval * (1 / (self.fclk_mhz_actual * 1e6))))
711+
if avg_interval != 0
712+
else 0
713+
),
686714
"min_pipeline_depth": round(min_latency / interval, 2) if interval != 0 else 0,
687715
"pipeline_depth": round(latency / interval, 2) if interval != 0 else 0,
688716
}

src/finn/transformation/fpgadataflow/instrumentation.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,14 @@ def __init__(
4343
self,
4444
fpga_part,
4545
clk_period_ns,
46+
avg_n=64,
4647
format="ip", # "ip" for Vivado (Zynq) or "xo" for Vitis (Alveo/Versal)
4748
):
4849
"""Initialize instrumentation IP generation with FPGA part and clock settings."""
4950
super().__init__()
5051
self.fpga_part = fpga_part
5152
self.clk_period_ns = clk_period_ns
53+
self.avg_n = avg_n
5254
self.format = format
5355

5456
def apply(self, model):
@@ -86,6 +88,7 @@ def apply(self, model):
8688
) as f:
8789
instrwrp_cpp = f.read()
8890
instrwrp_cpp = instrwrp_cpp.replace("@PENDING@", str(pending))
91+
instrwrp_cpp = instrwrp_cpp.replace("@AVG_N@", str(self.avg_n))
8992
instrwrp_cpp = instrwrp_cpp.replace("@ILEN@", str(ilen))
9093
instrwrp_cpp = instrwrp_cpp.replace("@OLEN@", str(olen))
9194
instrwrp_cpp = instrwrp_cpp.replace("@TI@", str(ti))

src/finn/transformation/fpgadataflow/make_zynq_proj.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,7 @@ def __init__(
625625
enable_debug=False,
626626
enable_instrumentation=False,
627627
instrumentation_no_dma=False,
628+
instrumentation_avg_n=64,
628629
live_fifo_sizing=False,
629630
partition_model_dir=None,
630631
):
@@ -637,6 +638,7 @@ def __init__(
637638
self.enable_debug = enable_debug
638639
self.enable_instrumentation = enable_instrumentation
639640
self.instrumentation_no_dma = instrumentation_no_dma
641+
self.instrumentation_avg_n = instrumentation_avg_n
640642
self.live_fifo_sizing = live_fifo_sizing
641643
self.partition_model_dir = partition_model_dir
642644

@@ -652,14 +654,18 @@ def apply(self, model):
652654
if self.enable_instrumentation:
653655
if self.instrumentation_no_dma is True or self.live_fifo_sizing is True:
654656
prep_transforms = [
655-
GenerateInstrumentationIP(self.fpga_part, self.period_ns),
657+
GenerateInstrumentationIP(
658+
self.fpga_part, self.period_ns, self.instrumentation_avg_n
659+
),
656660
Floorplan(),
657661
CreateDataflowPartition(partition_model_dir=self.partition_model_dir),
658662
]
659663
else:
660664
# DMA & Instrumentation Wrapper Case
661665
prep_transforms = [
662-
GenerateInstrumentationIP(self.fpga_part, self.period_ns),
666+
GenerateInstrumentationIP(
667+
self.fpga_part, self.period_ns, self.instrumentation_avg_n
668+
),
663669
InsertIODMA(self.axi_port_width),
664670
InsertDWC(),
665671
SpecializeLayers(self.fpga_part),

0 commit comments

Comments
 (0)