Description
I have been experimenting with the AffineToPipeline and PipelineToCalyx passes recently and discovered a bug with the PipelineToCalyx pass when testing a simple vectorized add.
The input program, written in the Affine dialect, is shown below (%A, %B, and %O are the top-level I/O, but are left as allocs because of how simulation in native Calyx currently works):
// Element-wise add of two 8-element i32 buffers: O[i] = A[i] + B[i] for i in [0, 8).
func.func @main() {
// The two inputs (%A, %B) and the output (%O) are allocated locally instead of
// being passed as function arguments.
%A = memref.alloc() : memref<8xi32>
%B = memref.alloc() : memref<8xi32>
%O = memref.alloc() : memref<8xi32>
// One loop over indices 0..7; each iteration performs two loads, one add, and one store.
affine.for %arg2 = 0 to 8 step 1 {
%7 = affine.load %A[%arg2] : memref<8xi32>
%8 = affine.load %B[%arg2] : memref<8xi32>
%9 = arith.addi %7, %8 : i32
// The sum is stored at the same index the operands were loaded from.
affine.store %9, %O[%arg2] : memref<8xi32>
}
return
}
I then ran this program through AffineToPipeline and PipelineToCalyx, exported it to native Calyx, and simulated the design to get the following output:
{
"cycles": 69,
"memories": {
"mem_0": [
1,
3,
7,
15,
31,
63,
127,
255
],
"mem_1": [
1,
1,
1,
1,
1,
1,
1,
1
],
"mem_2": [
256,
2,
4,
8,
16,
32,
64,
128
]
}
}
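Here `mem_2` corresponds to `%O`; the other two memories (`mem_0` and `mem_1`) hold the inputs. Notice that the correct values are all present but are stored at the wrong addresses: every sum lands one address too high (for example, the sum of the two elements at index 0, 1 + 1 = 2, is stored at address 1), and the last value wraps around to address 0.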
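I believe this is happening because, when we lower the pipeline to Calyx, we do not properly account for when the loop iter arg value is updated. Namely, every stage reads the same iter arg register (`while_0_arg0_reg` in the Calyx below): stage 0 increments it in `bb0_3`, and stage 1 then uses the already-incremented value as the store address in `bb0_5`. The iter arg value also needs to be pipelined, alongside the data, so that each stage sees the value belonging to its own iteration.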
This has become a blocking issue for me in exploring improvements to scheduling for Calyx, so I'm willing to put in a decent amount of engineering effort for a fix here. However, I am unsure of where and how it is best to make this fix.
For completeness, here is the program after being lowered to a pipeline:
// The vector add after AffineToPipeline: the affine.for has become a
// pipeline.while with two stages and an initiation interval of 1.
module {
func.func @main() {
%alloc = memref.alloc() : memref<8xi32>
%alloc_0 = memref.alloc() : memref<8xi32>
%alloc_1 = memref.alloc() : memref<8xi32>
// Loop bounds and step materialized as index constants.
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
// %arg0 is the loop index, carried as an iter arg and initialized to %c0.
pipeline.while II = 1 trip_count = 8 iter_args(%arg0 = %c0) : (index) -> () {
// Loop condition: keep iterating while %arg0 < 8 (unsigned compare).
%0 = arith.cmpi ult, %arg0, %c8 : index
pipeline.register %0 : i1
} do {
// Stage 0 (start = 0): load both operands at index %arg0 and compute the
// next index %arg0 + 1; all three values are registered for later use.
%0:3 = pipeline.while.stage start = 0 {
%1 = memref.load %alloc[%arg0] : memref<8xi32>
%2 = memref.load %alloc_0[%arg0] : memref<8xi32>
%3 = arith.addi %arg0, %c1 : index
pipeline.register %1, %2, %3 : i32, i32, index
} : i32, i32, index
// Stage 1 (start = 1): add the two values registered by stage 0 and store the sum.
pipeline.while.stage start = 1 {
%1 = arith.addi %0#0, %0#1 : i32
// NOTE: the store address is %arg0 itself, not a value registered by
// stage 0, so this stage depends on %arg0 still holding the index of
// the iteration whose operands it is adding.
memref.store %1, %alloc_1[%arg0] : memref<8xi32>
pipeline.register
}
// The incremented index (%0#2) becomes %arg0 for the next iteration.
pipeline.terminator iter_args(%0#2), results() : (index) -> ()
}
return
}
}
and the design after lowering to Calyx:
// The vector add after PipelineToCalyx: a single Calyx component whose control
// program runs a prologue, a bounded while loop, and an epilogue.
module attributes {calyx.entrypoint = "main"} {
calyx.component @main(%clk: i1 {clk}, %reset: i1 {reset}, %go: i1 {go}) -> (%done: i1 {done}) {
// Constants: initial index (0), loop bound (8), index step (1), and a constant-true enable.
%c0_i32 = hw.constant 0 : i32
%c8_i32 = hw.constant 8 : i32
%c1_i32 = hw.constant 1 : i32
%true = hw.constant true
// Slices narrowing the 32-bit loop index to the 3-bit memory address ports.
%std_slice_2.in, %std_slice_2.out = calyx.std_slice @std_slice_2 : i32, i3
%std_slice_1.in, %std_slice_1.out = calyx.std_slice @std_slice_1 : i32, i3
%std_slice_0.in, %std_slice_0.out = calyx.std_slice @std_slice_0 : i32, i3
// std_add_1 sums the two loaded operands (see bb0_5); std_add_0 increments the index (see bb0_3).
%std_add_1.left, %std_add_1.right, %std_add_1.out = calyx.std_add @std_add_1 : i32, i32, i32
%std_add_0.left, %std_add_0.right, %std_add_0.out = calyx.std_add @std_add_0 : i32, i32, i32
// Comparator for the loop condition (see bb0_0).
%std_lt_0.left, %std_lt_0.right, %std_lt_0.out = calyx.std_lt @std_lt_0 : i32, i32, i1
// Three external 8 x 32-bit memories: mem_0 and mem_1 are read, mem_2 is written.
%mem_2.addr0, %mem_2.write_data, %mem_2.write_en, %mem_2.clk, %mem_2.read_data, %mem_2.done = calyx.memory @mem_2 <[8] x 32> [3] {external = true} : i3, i32, i1, i1, i32, i1
%mem_1.addr0, %mem_1.write_data, %mem_1.write_en, %mem_1.clk, %mem_1.read_data, %mem_1.done = calyx.memory @mem_1 <[8] x 32> [3] {external = true} : i3, i32, i1, i1, i32, i1
%mem_0.addr0, %mem_0.write_data, %mem_0.write_en, %mem_0.clk, %mem_0.read_data, %mem_0.done = calyx.memory @mem_0 <[8] x 32> [3] {external = true} : i3, i32, i1, i1, i32, i1
// Registers holding the two values loaded from mem_0 and mem_1.
%stage_0_register_1_reg.in, %stage_0_register_1_reg.write_en, %stage_0_register_1_reg.clk, %stage_0_register_1_reg.reset, %stage_0_register_1_reg.out, %stage_0_register_1_reg.done = calyx.register @stage_0_register_1_reg : i32, i1, i1, i1, i32, i1
%stage_0_register_0_reg.in, %stage_0_register_0_reg.write_en, %stage_0_register_0_reg.clk, %stage_0_register_0_reg.reset, %stage_0_register_0_reg.out, %stage_0_register_0_reg.done = calyx.register @stage_0_register_0_reg : i32, i1, i1, i1, i32, i1
// The single register holding the loop index; every group below that needs the index reads it.
%while_0_arg0_reg.in, %while_0_arg0_reg.write_en, %while_0_arg0_reg.clk, %while_0_arg0_reg.reset, %while_0_arg0_reg.out, %while_0_arg0_reg.done = calyx.register @while_0_arg0_reg : i32, i1, i1, i1, i32, i1
calyx.wires {
// Initialize the loop index register to 0.
calyx.group @assign_while_0_init_0 {
calyx.assign %while_0_arg0_reg.in = %c0_i32 : i32
calyx.assign %while_0_arg0_reg.write_en = %true : i1
calyx.group_done %while_0_arg0_reg.done : i1
}
// Combinational loop condition: index < 8.
calyx.comb_group @bb0_0 {
calyx.assign %std_lt_0.left = %while_0_arg0_reg.out : i32
calyx.assign %std_lt_0.right = %c8_i32 : i32
}
// Read mem_0 at the current index into stage_0_register_0_reg.
calyx.group @bb0_1 {
calyx.assign %std_slice_2.in = %while_0_arg0_reg.out : i32
calyx.assign %mem_0.addr0 = %std_slice_2.out : i3
calyx.assign %stage_0_register_0_reg.in = %mem_0.read_data : i32
calyx.assign %stage_0_register_0_reg.write_en = %true : i1
calyx.group_done %stage_0_register_0_reg.done : i1
}
// Read mem_1 at the current index into stage_0_register_1_reg.
calyx.group @bb0_2 {
calyx.assign %std_slice_1.in = %while_0_arg0_reg.out : i32
calyx.assign %mem_1.addr0 = %std_slice_1.out : i3
calyx.assign %stage_0_register_1_reg.in = %mem_1.read_data : i32
calyx.assign %stage_0_register_1_reg.write_en = %true : i1
calyx.group_done %stage_0_register_1_reg.done : i1
}
// Increment the loop index register in place: index <- index + 1.
calyx.group @bb0_3 {
calyx.assign %std_add_0.left = %while_0_arg0_reg.out : i32
calyx.assign %std_add_0.right = %c1_i32 : i32
calyx.assign %while_0_arg0_reg.in = %std_add_0.out : i32
calyx.assign %while_0_arg0_reg.write_en = %true : i1
calyx.group_done %while_0_arg0_reg.done : i1
}
// Write the sum of the two stage-0 registers to mem_2.
// NOTE: the write address comes from while_0_arg0_reg, the same register
// that bb0_3 overwrites with index + 1; there is no separate register
// holding the index that the loaded operands belong to.
calyx.group @bb0_5 {
calyx.assign %std_slice_0.in = %while_0_arg0_reg.out : i32
calyx.assign %mem_2.addr0 = %std_slice_0.out : i3
calyx.assign %mem_2.write_data = %std_add_1.out : i32
calyx.assign %mem_2.write_en = %true : i1
calyx.assign %std_add_1.left = %stage_0_register_0_reg.out : i32
calyx.assign %std_add_1.right = %stage_0_register_1_reg.out : i32
calyx.group_done %mem_2.done : i1
}
}
calyx.control {
calyx.seq {
// Step 1: initialize the loop index.
calyx.par {
calyx.enable @assign_while_0_init_0
}
// Step 2 (prologue): run the loads and the index increment once, without a store.
calyx.par {
calyx.enable @bb0_1
calyx.enable @bb0_2
calyx.enable @bb0_3
}
// Step 3 (steady state): loads, index increment, and store all run in
// parallel; the loop carries a bound of 7 iterations.
calyx.while %std_lt_0.out with @bb0_0 {
calyx.par {
calyx.enable @bb0_1
calyx.enable @bb0_2
calyx.enable @bb0_3
calyx.enable @bb0_5
}
} {bound = 7 : i64}
// Step 4 (epilogue): run the store once more for the last loaded operands.
calyx.par {
calyx.enable @bb0_5
}
}
}
} {toplevel}
}