amd
diff --git a/examples/rms_norm/air_project/aie.--peano b/examples/rms_norm/air_project/aie.--peano
Lines changed: 196 additions & 0 deletions
diff --git a/examples/rms_norm/air_project/airinput.mlir b/examples/rms_norm/air_project/airinput.mlir
Lines changed: 55 additions & 0 deletions
diff --git a/examples/rms_norm/air_project/asm_air_output.mlir b/examples/rms_norm/air_project/asm_air_output.mlir
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,196 @@
+#loop_annotation = #llvm.loop_annotation<mustProgress = true>
+module {
+  aie.device(npu2) @rms_norm_kernel_0 {
+    %shim_noc_tile_0_0 = aie.tile(0, 0)  // Tile handles; each SSA name carries the two coordinates passed to aie.tile.
+    %shim_noc_tile_1_0 = aie.tile(1, 0)
+    %mem_tile_0_1 = aie.tile(0, 1)
+    %mem_tile_1_1 = aie.tile(1, 1)
+    %tile_0_2 = aie.tile(0, 2)  // hosts %core_0_2
+    %tile_0_3 = aie.tile(0, 3)  // hosts %core_0_3
+    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 1 : i32}  // released (by 1) by the %memtile_dma_1_1 buffer descriptor after it uses %buf4
+    %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}  // acquired (>= 1) by that same buffer descriptor; NOTE(review): starts at 0 and no op in this device releases it - confirm the producer side is not missing
+    %buf5 = aie.buffer(%mem_tile_0_1) {sym_name = "buf5"} : memref<2x64xbf16, 1 : i32>  // NOTE(review): not referenced by any other op in this module
+    %buf4 = aie.buffer(%mem_tile_1_1) {sym_name = "buf4"} : memref<2x64xbf16, 1>  // source buffer of the MM2S descriptor in %memtile_dma_1_1
+    %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<1xf32, 2>  // f32 sum-of-squares accumulator for %core_0_3
+    %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<1x64xbf16, 2>  // normalised output row written by %core_0_3
+    %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<1xf32, 2>  // f32 sum-of-squares accumulator for %core_0_2
+    %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<1x64xbf16, 2>  // normalised output row written by %core_0_2
+    memref.global "public" @__air_herd_arg_1 : memref<2x64xbf16, 1 : i32>  // input read by %core_0_3 through memref.get_global
+    %core_0_3 = aie.core(%tile_0_3) {  // Endless loop: for row %c1 of @__air_herd_arg_1, writes x * rsqrt(sum(x^2) / 64 + eps) into %buf2.
+      %c64 = arith.constant 64 : index  // trip count of both loops (row length)
+      %cst = arith.constant 0.000000e+00 : f32  // initial accumulator value
+      %cst_1 = arith.constant 6.400000e+01 : f32  // divisor applied to the sum of squares (equals the 64-element row length)
+      %cst_2 = arith.constant 9.99999974E-6 : f32  // epsilon added before rsqrt
+      %c1 = arith.constant 1 : index  // loop step, and the row index used by the subview below
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      %0 = memref.get_global @__air_herd_arg_1 : memref<2x64xbf16, 1 : i32>
+      cf.br ^bb2
+    ^bb2:  // pred: ^bb1
+      %subview = memref.subview %0[%c1, 0] [1, 64] [1, 1] : memref<2x64xbf16, 1 : i32> to memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>  // 1x64 view of row 1
+      memref.store %cst, %buf3[%c0] : memref<1xf32, 2>  // reset the accumulator to 0.0
+      scf.for %arg0 = %c0 to %c64 step %c1 {  // pass 1: %buf3[0] += x[i] * x[i], with x[i] widened to f32
+        %1 = memref.load %subview[%c0, %arg0] : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>
+        %2 = memref.load %buf3[%c0] : memref<1xf32, 2>
+        %3 = arith.extf %1 : bf16 to f32
+        %4 = arith.mulf %3, %3 : f32
+        %5 = arith.addf %4, %2 : f32
+        memref.store %5, %buf3[%c0] : memref<1xf32, 2>
+      } {loop_annotation = #loop_annotation}
+      scf.for %arg0 = %c0 to %c64 step %c1 {  // pass 2: scale every element by rsqrt(sum / 64 + eps)
+        %1 = memref.load %subview[%c0, %arg0] : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>
+        %2 = memref.load %buf3[%c0] : memref<1xf32, 2>  // %buf3 is not stored to in this loop, so %2..%5 are the same on every iteration as written
+        %3 = arith.divf %2, %cst_1 : f32
+        %4 = arith.addf %3, %cst_2 : f32
+        %5 = math.rsqrt %4 : f32
+        %6 = arith.extf %1 : bf16 to f32
+        %7 = arith.mulf %6, %5 : f32
+        %8 = arith.truncf %7 : f32 to bf16
+        memref.store %8, %buf2[%c0, %arg0] : memref<1x64xbf16, 2>  // NOTE(review): %buf2 is only written here; no DMA or other op in this device reads it - confirm how results leave the core
+      } {loop_annotation = #loop_annotation}
+      cf.br ^bb1  // unconditional back-edge: the core body never terminates
+    }
+    memref.global "public" @__air_herd_arg : memref<2x64xbf16, 1 : i32>  // input read by %core_0_2 through memref.get_global
+    %core_0_2 = aie.core(%tile_0_2) {  // Endless loop: for row %c0 of @__air_herd_arg, writes x * rsqrt(sum(x^2) / 64 + eps) into %buf0.
+      %c64 = arith.constant 64 : index  // trip count of both loops (row length)
+      %cst = arith.constant 0.000000e+00 : f32  // initial accumulator value
+      %cst_1 = arith.constant 6.400000e+01 : f32  // divisor applied to the sum of squares (equals the 64-element row length)
+      %cst_2 = arith.constant 9.99999974E-6 : f32  // epsilon added before rsqrt
+      %c1 = arith.constant 1 : index  // loop step
+      %c0 = arith.constant 0 : index  // accumulator index, and the row index used by the subview below
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      %0 = memref.get_global @__air_herd_arg : memref<2x64xbf16, 1 : i32>
+      cf.br ^bb2
+    ^bb2:  // pred: ^bb1
+      %subview = memref.subview %0[%c0, 0] [1, 64] [1, 1] : memref<2x64xbf16, 1 : i32> to memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>  // 1x64 view of row 0
+      memref.store %cst, %buf1[%c0] : memref<1xf32, 2>  // reset the accumulator to 0.0
+      scf.for %arg0 = %c0 to %c64 step %c1 {  // pass 1: %buf1[0] += x[i] * x[i], with x[i] widened to f32
+        %1 = memref.load %subview[%c0, %arg0] : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>
+        %2 = memref.load %buf1[%c0] : memref<1xf32, 2>
+        %3 = arith.extf %1 : bf16 to f32
+        %4 = arith.mulf %3, %3 : f32
+        %5 = arith.addf %4, %2 : f32
+        memref.store %5, %buf1[%c0] : memref<1xf32, 2>
+      } {loop_annotation = #loop_annotation}
+      scf.for %arg0 = %c0 to %c64 step %c1 {  // pass 2: scale every element by rsqrt(sum / 64 + eps)
+        %1 = memref.load %subview[%c0, %arg0] : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>
+        %2 = memref.load %buf1[%c0] : memref<1xf32, 2>  // %buf1 is not stored to in this loop, so %2..%5 are the same on every iteration as written
+        %3 = arith.divf %2, %cst_1 : f32
+        %4 = arith.addf %3, %cst_2 : f32
+        %5 = math.rsqrt %4 : f32
+        %6 = arith.extf %1 : bf16 to f32
+        %7 = arith.mulf %6, %5 : f32
+        %8 = arith.truncf %7 : f32 to bf16
+        memref.store %8, %buf0[%c0, %arg0] : memref<1x64xbf16, 2>  // NOTE(review): %buf0 is only written here; no DMA or other op in this device reads it - confirm how results leave the core
+      } {loop_annotation = #loop_annotation}
+      cf.br ^bb1  // unconditional back-edge: the core body never terminates
+    }
+    air.channel @channel_0 []  // channel symbols; the same two names are declared again at module scope further down
+    air.channel @channel_1 []
+    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)  // route: shim tile (0,0) DMA 0 -> mem tile (0,1) DMA 0 (presumably the input path - confirm)
+    aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0)  // route: mem tile (1,1) DMA 0 -> shim tile (1,0) DMA 0 (presumably the output path - confirm)
+    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {  // DMA program for mem tile (1,1): one self-chained MM2S descriptor that sends %buf4
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb2)  // MM2S channel 0: descriptor chain starts at ^bb1, ^bb2 ends the program
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)  // wait until %lock_1_1_0 >= 1 (declared with init = 0)
+      aie.dma_bd(%buf4 : memref<2x64xbf16, 1>, 0, 128) {task_id = 0 : i32}  // offset 0, length 128, i.e. the whole 2x64 buffer
+      aie.use_lock(%lock_1_1, Release, 1)  // release %lock_1_1 by 1 once the descriptor is done
+      aie.next_bd ^bb1  // chain back to the same descriptor so the transfer repeats
+    ^bb2:  // pred: ^bb0
+      aie.end
+    }
+    aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, S2MM, 0)  // symbol matches the "air_channel_1" base in the air.channel.get metadataArray of @rms_norm_kernel
+    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)  // symbol matches the "air_channel_0" base in the air.channel.put metadataArray of @rms_norm_kernel
+  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
+  airrt.module_metadata{  // runtime metadata for the segment and herd defined in this module
+    airrt.segment_metadata attributes {dma_allocations = [{channel = 2 : i64, col = 0 : i64, id = 3 : i64, location = 0 : i64, row = -1 : i64}], sym_name = "rms_norm_kernel_0"}{  // sym_name matches the aie.device / air.segment symbol; id = 3 equals the id on the segment-side air.channel.get
+      airrt.herd_metadata {dma_allocations = [], loc_x = 0 : i64, loc_y = 2 : i64, size_x = 1 : i64, size_y = 2 : i64, sym_name = "herd_0"}  // 1x2 herd at (0,2), consistent with the cores on %tile_0_2 and %tile_0_3
+    }
+  }
+  air.channel @channel_0 []  // carries the input block from %arg15 into the segment (put below, get inside the segment)
+  air.channel @channel_1 []  // carries %results_4 from the segment back to %arg16
+  func.func @rms_norm_kernel(%arg0: memref<*xbf16> {tt.divisibility = 16 : i32}, %arg1: memref<*xbf16> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {  // Async AIR form of the kernel; only %arg0 and %arg1 are used in this body.
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %0 = air.launch async (%arg9, %arg10, %arg11) in (%arg12=%c16, %arg13=%c1, %arg14=%c1) args(%arg15=%arg0, %arg16=%arg1) : memref<*xbf16>, memref<*xbf16> attributes {id = 1 : i32} {  // launch sizes are 16 x 1 x 1
+      %c0 = arith.constant 0 : index
+      %c64 = arith.constant 64 : index
+      %c2 = arith.constant 2 : index
+      %c1_0 = arith.constant 1 : index
+      %c128 = arith.constant 128 : index
+      %1 = arith.muli %arg10, %c128 : index  // NOTE(review): %arg10 looks like the index of the launch dimension sized %c1, which would make this offset always 0 - confirm whether %arg9 (sized %c16) was intended
+      %2 = air.channel.put async  @channel_0[] (%arg15[%c0, %1] [%c2, %c64] [%c64, %c1_0]) {id = 1 : i32, metadataArray = [{base = "air_channel_0", index = 0 : i32}]} : (memref<*xbf16>)  // send a 2x64 block of %arg15: offsets [0, %1], strides [64, 1]
+      %3 = air.channel.get async  @channel_1[] (%arg16[%c0, %1] [%c2, %c64] [%c64, %c1_0]) {id = 2 : i32, metadataArray = [{base = "air_channel_1", index = 0 : i32}]} : (memref<*xbf16>)  // receive a 2x64 block into %arg16 at the same offsets/strides
+      %4 = air.segment @rms_norm_kernel_0 async  attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} {
+        %c2_1 = arith.constant 2 : index
+        %c1_2 = arith.constant 1 : index
+        %async_token, %results = air.execute -> (memref<2x64xbf16, 1 : i32>) {  // allocate the segment-side input buffer (memory space 1)
+          %alloc = memref.alloc() : memref<2x64xbf16, 1 : i32>
+          air.execute_terminator %alloc : memref<2x64xbf16, 1 : i32>
+        }
+        %5 = air.channel.get async [%async_token]  @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<2x64xbf16, 1 : i32>)  // fill %results from @channel_0 once it is allocated
+        %async_token_3, %results_4 = air.execute -> (memref<2x64xbf16, 1>) {  // allocate the segment-side output buffer (memory space 1)
+          %alloc = memref.alloc() : memref<2x64xbf16, 1>
+          air.execute_terminator %alloc : memref<2x64xbf16, 1>
+        }
+        %6 = air.herd @herd_0 async [%5]  tile (%arg17, %arg18) in (%arg19=%c1_2, %arg20=%c2_1) args(%arg21=%results) : memref<2x64xbf16, 1 : i32> attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} {  // 1x2 herd, starts after the input get; only %results is passed in (%results_4 is not)
+          %cst = arith.constant 9.99999974E-6 : f32  // epsilon added before rsqrt
+          %cst_6 = arith.constant 6.400000e+01 : f32  // divisor applied to the sum of squares
+          %cst_7 = arith.constant 0.000000e+00 : f32  // initial accumulator value
+          %c0_8 = arith.constant 0 : index
+          %c64_9 = arith.constant 64 : index
+          %c1_10 = arith.constant 1 : index
+          %subview = memref.subview %arg21[%arg18, 0] [1, 64] [1, 1] : memref<2x64xbf16, 1 : i32> to memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>  // each tile takes the row selected by its second tile index %arg18
+          %async_token_11, %results_12 = air.execute -> (memref<1xf32, 2>) {  // allocate the f32 accumulator (memory space 2)
+            %alloc = memref.alloc() : memref<1xf32, 2>
+            air.execute_terminator %alloc : memref<1xf32, 2>
+          }
+          %async_token_13 = air.execute [%async_token_11] {  // zero the accumulator
+            memref.store %cst_7, %results_12[%c0_8] : memref<1xf32, 2>
+          }
+          %async_token_14 = air.execute [%async_token_13] {  // pass 1: accumulate the sum of squares of the row in f32
+            scf.for %arg22 = %c0_8 to %c64_9 step %c1_10 {
+              %8 = memref.load %subview[%c0_8, %arg22] : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>
+              %9 = memref.load %results_12[%c0_8] : memref<1xf32, 2>
+              %10 = arith.extf %8 : bf16 to f32
+              %11 = arith.mulf %10, %10 : f32
+              %12 = arith.addf %11, %9 : f32
+              memref.store %12, %results_12[%c0_8] : memref<1xf32, 2>
+            }
+          }
+          %async_token_15, %results_16 = air.execute -> (memref<1x64xbf16, 2>) {  // allocate the per-tile output row (memory space 2)
+            %alloc = memref.alloc() : memref<1x64xbf16, 2>
+            air.execute_terminator %alloc : memref<1x64xbf16, 2>
+          }
+          %async_token_17 = air.execute [%async_token_15, %async_token_14] {  // pass 2: x * rsqrt(sum / 64 + eps), truncated back to bf16
+            scf.for %arg22 = %c0_8 to %c64_9 step %c1_10 {
+              %8 = memref.load %subview[%c0_8, %arg22] : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>
+              %9 = memref.load %results_12[%c0_8] : memref<1xf32, 2>
+              %10 = arith.divf %9, %cst_6 : f32
+              %11 = arith.addf %10, %cst : f32
+              %12 = math.rsqrt %11 : f32
+              %13 = arith.extf %8 : bf16 to f32
+              %14 = arith.mulf %13, %12 : f32
+              %15 = arith.truncf %14 : f32 to bf16
+              memref.store %15, %results_16[%c0_8, %arg22] : memref<1x64xbf16, 2>
+            }
+          }
+          %async_token_18 = air.execute [%async_token_17] {
+            memref.dealloc %results_12 : memref<1xf32, 2>
+          }
+          %async_token_19 = air.execute [%async_token_17] {  // NOTE(review): %results_16 is freed without being copied anywhere, so the computed row does not leave the herd as written
+            memref.dealloc %results_16 : memref<1x64xbf16, 2>
+          }
+        }
+        %7 = air.channel.put async [%async_token_3]  @channel_1[] (%results_4[] [] []) {id = 4 : i32} : (memref<2x64xbf16, 1>)  // NOTE(review): depends only on the allocation token, not on the herd (%6), and nothing visible writes %results_4 first - confirm the output path
+        %async_token_5 = air.execute [%7] {
+          memref.dealloc %results_4 : memref<2x64xbf16, 1>
+        }
+        air.wait_all [%6, %async_token_5]  {air.segment_end}  // segment completes after both the herd and the output dealloc
+      }
+    }
+    return
+  }
+}
@@ -0,0 +1,55 @@
+#map = affine_map<(d0, d1) -> (d0, d1)>  // identity map for the 2-D operands
+#map1 = affine_map<(d0, d1) -> (d0)>  // drops d1; used for the per-row accumulator
+module {
+  func.func @rms_norm_kernel(%arg0: memref<*xbf16> {tt.divisibility = 16 : i32}, %arg1: memref<*xbf16> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {  // Synchronous AIR form of the kernel; only %arg0 and %arg1 are used in this body.
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    air.launch (%arg9, %arg10, %arg11) in (%arg12=%c16, %arg13=%c1, %arg14=%c1) args(%arg15=%arg0, %arg16=%arg1) : memref<*xbf16>, memref<*xbf16> {  // launch sizes are 16 x 1 x 1
+      air.segment @rms_norm_kernel_0  args(%arg17=%arg10, %arg18=%arg15, %arg19=%arg16) : index, memref<*xbf16>, memref<*xbf16> {  // NOTE(review): forwards %arg10, which looks like the index of the launch dimension sized %c1 - confirm whether %arg9 was intended
+        %c0 = arith.constant 0 : index
+        %c64 = arith.constant 64 : index
+        %c2 = arith.constant 2 : index
+        %c1_0 = arith.constant 1 : index
+        %c128 = arith.constant 128 : index
+        %0 = arith.muli %arg17, %c128 : index  // element offset of this launch instance's block (128 = 2 rows x 64 elements)
+        %alloc = memref.alloc() : memref<2x64xbf16, 1 : i32>  // input staging buffer (memory space 1)
+        air.dma_memcpy_nd (%alloc[] [] [], %arg18[%c0, %0] [%c2, %c64] [%c64, %c1_0]) {id = 1 : i32} : (memref<2x64xbf16, 1 : i32>, memref<*xbf16>)  // copy a 2x64 block of %arg18 into %alloc: offsets [0, %0], strides [64, 1]
+        %alloc_1 = memref.alloc() : memref<2x64xbf16, 1>  // output staging buffer (memory space 1)
+        air.herd @herd_0  tile (%arg20, %arg21) in (%arg22=%c2, %arg23=%c1_0) args(%arg24=%alloc, %arg25=%alloc_1) : memref<2x64xbf16, 1 : i32>, memref<2x64xbf16, 1> {  // 2x1 herd; NOTE(review): %arg25 (the output buffer) is never used inside this body
+          %c64_2 = arith.constant 64 : index
+          %c1_3 = arith.constant 1 : index
+          %c0_4 = arith.constant 0 : index
+          %cst = arith.constant 0.000000e+00 : f32  // initial accumulator value
+          %cst_5 = arith.constant 6.400000e+01 : f32  // divisor applied to the sum of squares
+          %cst_6 = arith.constant 9.99999974E-6 : f32  // epsilon added before rsqrt
+          %subview = memref.subview %arg24[%arg20, 0] [1, 64] [1, 1] : memref<2x64xbf16, 1 : i32> to memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>  // each tile takes the row selected by its first tile index %arg20
+          %alloc_7 = memref.alloc() : memref<1xf32, 2>  // f32 accumulator (memory space 2)
+          memref.store %cst, %alloc_7[%c0_4] : memref<1xf32, 2>  // zero the accumulator before the reduction
+          linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%subview : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>) outs(%alloc_7 : memref<1xf32, 2>) {  // reduce over d1: accumulate x * x in f32
+          ^bb0(%in: bf16, %out: f32):
+            %1 = arith.extf %in : bf16 to f32
+            %2 = arith.mulf %1, %1 : f32
+            %3 = arith.addf %2, %out : f32
+            linalg.yield %3 : f32
+          }
+          %alloc_8 = memref.alloc() : memref<1x64xbf16, 2>  // per-tile output row (memory space 2)
+          linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%subview, %alloc_7 : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>, memref<1xf32, 2>) outs(%alloc_8 : memref<1x64xbf16, 2>) {  // elementwise: x * rsqrt(sum / 64 + eps), truncated to bf16
+          ^bb0(%in: bf16, %in_9: f32, %out: bf16):
+            %1 = arith.divf %in_9, %cst_5 : f32
+            %2 = arith.addf %1, %cst_6 : f32
+            %3 = math.rsqrt %2 : f32
+            %4 = arith.extf %in : bf16 to f32
+            %5 = arith.mulf %4, %3 : f32
+            %6 = arith.truncf %5 : f32 to bf16
+            linalg.yield %6 : bf16
+          }
+          memref.dealloc %alloc_7 : memref<1xf32, 2>
+          memref.dealloc %alloc_8 : memref<1x64xbf16, 2>  // NOTE(review): %alloc_8 is freed without being copied into %arg25, so the result does not reach %alloc_1 as written
+        }
+        air.dma_memcpy_nd (%arg19[%c0, %0] [%c2, %c64] [%c64, %c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xbf16>, memref<2x64xbf16, 1>)  // copy %alloc_1 out to %arg19 at the same offsets/strides as the input copy
+        memref.dealloc %alloc_1 : memref<2x64xbf16, 1>
+      }
+    }
+    return
+  }
+}
@@ -0,0 +1,55 @@
+#map = affine_map<(d0, d1) -> (d0, d1)>  // identity map for the 2-D operands
+#map1 = affine_map<(d0, d1) -> (d0)>  // drops d1; used for the per-row accumulator
+module {
+  func.func @rms_norm_kernel(%arg0: memref<*xbf16> {tt.divisibility = 16 : i32}, %arg1: memref<*xbf16> {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {  // Synchronous AIR form of the kernel; only %arg0 and %arg1 are used in this body.
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    air.launch (%arg9, %arg10, %arg11) in (%arg12=%c16, %arg13=%c1, %arg14=%c1) args(%arg15=%arg0, %arg16=%arg1) : memref<*xbf16>, memref<*xbf16> {  // launch sizes are 16 x 1 x 1
+      air.segment @rms_norm_kernel_0  args(%arg17=%arg10, %arg18=%arg15, %arg19=%arg16) : index, memref<*xbf16>, memref<*xbf16> {  // NOTE(review): forwards %arg10, which looks like the index of the launch dimension sized %c1 - confirm whether %arg9 was intended
+        %c0 = arith.constant 0 : index
+        %c64 = arith.constant 64 : index
+        %c2 = arith.constant 2 : index
+        %c1_0 = arith.constant 1 : index
+        %c128 = arith.constant 128 : index
+        %0 = arith.muli %arg17, %c128 : index  // element offset of this launch instance's block (128 = 2 rows x 64 elements)
+        %alloc = memref.alloc() : memref<2x64xbf16, 1 : i32>  // input staging buffer (memory space 1)
+        air.dma_memcpy_nd (%alloc[] [] [], %arg18[%c0, %0] [%c2, %c64] [%c64, %c1_0]) {id = 1 : i32} : (memref<2x64xbf16, 1 : i32>, memref<*xbf16>)  // copy a 2x64 block of %arg18 into %alloc: offsets [0, %0], strides [64, 1]
+        %alloc_1 = memref.alloc() : memref<2x64xbf16, 1>  // output staging buffer (memory space 1)
+        air.herd @herd_0  tile (%arg20, %arg21) in (%arg22=%c2, %arg23=%c1_0) args(%arg24=%alloc, %arg25=%alloc_1) : memref<2x64xbf16, 1 : i32>, memref<2x64xbf16, 1> {  // 2x1 herd; NOTE(review): %arg25 (the output buffer) is never used inside this body
+          %c64_2 = arith.constant 64 : index
+          %c1_3 = arith.constant 1 : index
+          %c0_4 = arith.constant 0 : index
+          %cst = arith.constant 0.000000e+00 : f32  // initial accumulator value
+          %cst_5 = arith.constant 6.400000e+01 : f32  // divisor applied to the sum of squares
+          %cst_6 = arith.constant 9.99999974E-6 : f32  // epsilon added before rsqrt
+          %subview = memref.subview %arg24[%arg20, 0] [1, 64] [1, 1] : memref<2x64xbf16, 1 : i32> to memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>  // each tile takes the row selected by its first tile index %arg20
+          %alloc_7 = memref.alloc() : memref<1xf32, 2>  // f32 accumulator (memory space 2)
+          memref.store %cst, %alloc_7[%c0_4] : memref<1xf32, 2>  // zero the accumulator before the reduction
+          linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%subview : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>) outs(%alloc_7 : memref<1xf32, 2>) {  // reduce over d1: accumulate x * x in f32
+          ^bb0(%in: bf16, %out: f32):
+            %1 = arith.extf %in : bf16 to f32
+            %2 = arith.mulf %1, %1 : f32
+            %3 = arith.addf %2, %out : f32
+            linalg.yield %3 : f32
+          }
+          %alloc_8 = memref.alloc() : memref<1x64xbf16, 2>  // per-tile output row (memory space 2)
+          linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%subview, %alloc_7 : memref<1x64xbf16, strided<[64, 1], offset: ?>, 1 : i32>, memref<1xf32, 2>) outs(%alloc_8 : memref<1x64xbf16, 2>) {  // elementwise: x * rsqrt(sum / 64 + eps), truncated to bf16
+          ^bb0(%in: bf16, %in_9: f32, %out: bf16):
+            %1 = arith.divf %in_9, %cst_5 : f32
+            %2 = arith.addf %1, %cst_6 : f32
+            %3 = math.rsqrt %2 : f32
+            %4 = arith.extf %in : bf16 to f32
+            %5 = arith.mulf %4, %3 : f32
+            %6 = arith.truncf %5 : f32 to bf16
+            linalg.yield %6 : bf16
+          }
+          memref.dealloc %alloc_7 : memref<1xf32, 2>
+          memref.dealloc %alloc_8 : memref<1x64xbf16, 2>  // NOTE(review): %alloc_8 is freed without being copied into %arg25, so the result does not reach %alloc_1 as written
+        }
+        air.dma_memcpy_nd (%arg19[%c0, %0] [%c2, %c64] [%c64, %c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xbf16>, memref<2x64xbf16, 1>)  // copy %alloc_1 out to %arg19 at the same offsets/strides as the input copy
+        memref.dealloc %alloc_1 : memref<2x64xbf16, 1>
+      }
+    }
+    return
+  }
+}