Skip to content

[LoopSchedule][Calyx] PipelineToCalyx Pass Produces Incorrect Results on Simple Designs #4256

Open
@andrewb1999

Description

I have been experimenting with the AffineToPipeline and PipelineToCalyx passes recently and discovered a bug with the PipelineToCalyx pass when testing a simple vectorized add.

The input program in affine (A, B, and O are the top level IO but are left as allocs because of how simulation in native Calyx works for now):

func.func @main() {                                                          
  %A = memref.alloc() : memref<8xi32>   
  %B = memref.alloc() : memref<8xi32>   
  %O = memref.alloc() : memref<8xi32>
  affine.for %arg2 = 0 to 8 step 1 {
    %7 = affine.load %A[%arg2] : memref<8xi32>
    %8 = affine.load %B[%arg2] : memref<8xi32>
    %9 = arith.addi %7, %8 : i32
    affine.store %9, %O[%arg2] : memref<8xi32>
  }
  return
}

I then ran this program through AffineToPipeline and PipelineToCalyx, exported it to native Calyx, and simulated the design to get the following output:

{
  "cycles": 69,
  "memories": {
    "mem_0": [
      1,
      3,
      7,
      15,
      31,
      63,
      127,
      255
    ],
    "mem_1": [
      1,
      1,
      1,
      1,
      1,
      1,
      1,
      1
    ],
    "mem_2": [
      256,
      2,
      4,
      8,
      16,
      32,
      64,
      128
    ]
  }
}

In this case, mem_2 corresponds to %O; the other memories hold the input values. Notice that the correct values are present but stored at the wrong addresses: the expected output is [2, 4, 8, 16, 32, 64, 128, 256], but each sum is written one address too high, with the last value wrapping around to address 0.

I believe this is happening because, when we lower the pipeline to Calyx, we do not properly account for when the loop iter arg value is updated. Namely, every stage reads the same loop iter arg register, so stage 1 sees the value that stage 0 has already incremented for the next iteration. The iter arg needs to be pipelined along with the other stage values so that each stage sees the value belonging to its own iteration.

This has become a blocking issue for me in exploring improvements to scheduling for Calyx, so I'm willing to put in a decent amount of engineering effort for a fix here. However, I am unsure of where and how it is best to make this fix.

For completeness, here is the program after being lowered to a pipeline:

module {
  func.func @main() {
    %alloc = memref.alloc() : memref<8xi32>
    %alloc_0 = memref.alloc() : memref<8xi32>
    %alloc_1 = memref.alloc() : memref<8xi32>
    %c0 = arith.constant 0 : index
    %c8 = arith.constant 8 : index
    %c1 = arith.constant 1 : index
    pipeline.while II =  1 trip_count =  8 iter_args(%arg0 = %c0) : (index) -> () {
      %0 = arith.cmpi ult, %arg0, %c8 : index
      pipeline.register %0 : i1
    } do {
      %0:3 = pipeline.while.stage start = 0 {
        %1 = memref.load %alloc[%arg0] : memref<8xi32>
        %2 = memref.load %alloc_0[%arg0] : memref<8xi32>
        %3 = arith.addi %arg0, %c1 : index
        pipeline.register %1, %2, %3 : i32, i32, index
      } : i32, i32, index
      pipeline.while.stage start = 1 {
        %1 = arith.addi %0#0, %0#1 : i32
        memref.store %1, %alloc_1[%arg0] : memref<8xi32>
        pipeline.register 
      }
      pipeline.terminator iter_args(%0#2), results() : (index) -> ()
    }
    return
  }
}

and the design after lowering to Calyx:

module attributes {calyx.entrypoint = "main"} {
  calyx.component @main(%clk: i1 {clk}, %reset: i1 {reset}, %go: i1 {go}) -> (%done: i1 {done}) {
    %c0_i32 = hw.constant 0 : i32
    %c8_i32 = hw.constant 8 : i32
    %c1_i32 = hw.constant 1 : i32
    %true = hw.constant true
    %std_slice_2.in, %std_slice_2.out = calyx.std_slice @std_slice_2 : i32, i3
    %std_slice_1.in, %std_slice_1.out = calyx.std_slice @std_slice_1 : i32, i3
    %std_slice_0.in, %std_slice_0.out = calyx.std_slice @std_slice_0 : i32, i3
    %std_add_1.left, %std_add_1.right, %std_add_1.out = calyx.std_add @std_add_1 : i32, i32, i32
    %std_add_0.left, %std_add_0.right, %std_add_0.out = calyx.std_add @std_add_0 : i32, i32, i32
    %std_lt_0.left, %std_lt_0.right, %std_lt_0.out = calyx.std_lt @std_lt_0 : i32, i32, i1
    %mem_2.addr0, %mem_2.write_data, %mem_2.write_en, %mem_2.clk, %mem_2.read_data, %mem_2.done = calyx.memory @mem_2 <[8] x 32> [3] {external = true} : i3, i32, i1, i1, i32, i1
    %mem_1.addr0, %mem_1.write_data, %mem_1.write_en, %mem_1.clk, %mem_1.read_data, %mem_1.done = calyx.memory @mem_1 <[8] x 32> [3] {external = true} : i3, i32, i1, i1, i32, i1
    %mem_0.addr0, %mem_0.write_data, %mem_0.write_en, %mem_0.clk, %mem_0.read_data, %mem_0.done = calyx.memory @mem_0 <[8] x 32> [3] {external = true} : i3, i32, i1, i1, i32, i1
    %stage_0_register_1_reg.in, %stage_0_register_1_reg.write_en, %stage_0_register_1_reg.clk, %stage_0_register_1_reg.reset, %stage_0_register_1_reg.out, %stage_0_register_1_reg.done = calyx.register @stage_0_register_1_reg : i32, i1, i1, i1, i32, i1
    %stage_0_register_0_reg.in, %stage_0_register_0_reg.write_en, %stage_0_register_0_reg.clk, %stage_0_register_0_reg.reset, %stage_0_register_0_reg.out, %stage_0_register_0_reg.done = calyx.register @stage_0_register_0_reg : i32, i1, i1, i1, i32, i1
    %while_0_arg0_reg.in, %while_0_arg0_reg.write_en, %while_0_arg0_reg.clk, %while_0_arg0_reg.reset, %while_0_arg0_reg.out, %while_0_arg0_reg.done = calyx.register @while_0_arg0_reg : i32, i1, i1, i1, i32, i1
    calyx.wires {
      calyx.group @assign_while_0_init_0 {
        calyx.assign %while_0_arg0_reg.in = %c0_i32 : i32
        calyx.assign %while_0_arg0_reg.write_en = %true : i1
        calyx.group_done %while_0_arg0_reg.done : i1
      }
      calyx.comb_group @bb0_0 {
        calyx.assign %std_lt_0.left = %while_0_arg0_reg.out : i32
        calyx.assign %std_lt_0.right = %c8_i32 : i32
      }
      calyx.group @bb0_1 {
        calyx.assign %std_slice_2.in = %while_0_arg0_reg.out : i32
        calyx.assign %mem_0.addr0 = %std_slice_2.out : i3
        calyx.assign %stage_0_register_0_reg.in = %mem_0.read_data : i32
        calyx.assign %stage_0_register_0_reg.write_en = %true : i1
        calyx.group_done %stage_0_register_0_reg.done : i1
      }
      calyx.group @bb0_2 {
        calyx.assign %std_slice_1.in = %while_0_arg0_reg.out : i32
        calyx.assign %mem_1.addr0 = %std_slice_1.out : i3
        calyx.assign %stage_0_register_1_reg.in = %mem_1.read_data : i32
        calyx.assign %stage_0_register_1_reg.write_en = %true : i1
        calyx.group_done %stage_0_register_1_reg.done : i1
      }
      calyx.group @bb0_3 {
        calyx.assign %std_add_0.left = %while_0_arg0_reg.out : i32
        calyx.assign %std_add_0.right = %c1_i32 : i32
        calyx.assign %while_0_arg0_reg.in = %std_add_0.out : i32
        calyx.assign %while_0_arg0_reg.write_en = %true : i1
        calyx.group_done %while_0_arg0_reg.done : i1
      }
      calyx.group @bb0_5 {
        calyx.assign %std_slice_0.in = %while_0_arg0_reg.out : i32
        calyx.assign %mem_2.addr0 = %std_slice_0.out : i3
        calyx.assign %mem_2.write_data = %std_add_1.out : i32
        calyx.assign %mem_2.write_en = %true : i1
        calyx.assign %std_add_1.left = %stage_0_register_0_reg.out : i32
        calyx.assign %std_add_1.right = %stage_0_register_1_reg.out : i32
        calyx.group_done %mem_2.done : i1
      }
    }
    calyx.control {
      calyx.seq {
        calyx.par {
          calyx.enable @assign_while_0_init_0
        }
        calyx.par {
          calyx.enable @bb0_1
          calyx.enable @bb0_2
          calyx.enable @bb0_3
        }
        calyx.while %std_lt_0.out with @bb0_0 {
          calyx.par {
            calyx.enable @bb0_1
            calyx.enable @bb0_2
            calyx.enable @bb0_3
            calyx.enable @bb0_5
          }
        } {bound = 7 : i64}
        calyx.par {
          calyx.enable @bb0_5
        }
      }
    }
  } {toplevel}
}

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions