
Commit a852a42

[CPU] Enable masking by default if the target has avx512 feature. (#23470)
AVX-512 has native opmask registers, so we can enable masking by default when the target has that feature. Peeling usually requires more stack buffer space, so switching to masking addresses that issue. When peeling is used, compilation goes through `setMatmulPeelingRootConfig`, which sets the tile sizes aggressively.

The change itself may introduce performance regressions on x86, but this matmul path is rarely used; it exists mostly for functionality, and people should use data-tiling if they are looking for performance on x86. It would be possible to adjust the tile sizes the way the other path does, but that could also impact SVE targets. Since this path is seldom used and not recommended on x86, the revision accepts the regressions.

Fixes #23464

Signed-off-by: hanhanW <hanhan0912@gmail.com>
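For background: masking folds the tail iterations of a vectorized loop into predicated vector ops, whereas peeling splits off a separate remainder loop with its own buffers. A minimal MLIR sketch of the masked form; the shapes and names here are illustrative only, not taken from this change:

```mlir
// Read a 13-element row with a 16-lane f32 vector: the mask turns off the
// three out-of-range lanes, so no peeled remainder loop (and none of the
// extra stack buffering that comes with one) is needed.
%c0 = arith.constant 0 : index
%c13 = arith.constant 13 : index
%pad = arith.constant 0.0 : f32
%mask = vector.create_mask %c13 : vector<16xi1>
%v = vector.transfer_read %src[%c0], %pad, %mask {in_bounds = [true]}
       : memref<13xf32>, vector<16xf32>
```

On AVX-512 such predicates map directly onto the k opmask registers, which is what makes masking cheap on that target.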
1 parent 21df09d commit a852a42

3 files changed: 26 additions & 71 deletions


compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -265,6 +265,11 @@ getVectorPreProcStrategy(linalg::LinalgOp linalgOp) {
     return VectorPreProcStrategy::Masking;
   }
 
+  // AVX-512 has native opmask registers for efficient masking.
+  if (hasAVX512fFeature(targetAttr.getConfiguration())) {
+    return VectorPreProcStrategy::Masking;
+  }
+
   if (!clDisableVectorPeeling) {
     return VectorPreProcStrategy::Peeling;
   }
```
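For reference, an executable target that now takes the new branch is the `+avx512f` one from the test deleted below; judging by its name, `hasAVX512fFeature` keys off that entry in `cpu_features`:

```mlir
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
```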

compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir

Lines changed: 1 addition & 51 deletions
```diff
@@ -49,56 +49,6 @@ func.func @check_no_cse() attributes {hal.executable.target = #executable_target
 
 // -----
 
-#pipeline_layout = #hal.pipeline.layout<constants = 4, bindings = [
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>,
-  #hal.pipeline.binding<storage_buffer>
-]>
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
-#map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @peel_partially_unaligned_matmul() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
-  %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
-  %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
-  %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
-  %4 = arith.index_castui %0 {stream.alignment = 128 : index, stream.values = [0 : index, 131712 : index]} : i32 to index
-  %5 = arith.index_castui %1 {stream.alignment = 64 : index, stream.values = [576704 : index, 1763072 : index]} : i32 to index
-  %6 = arith.index_castui %2 {stream.alignment = 64 : index, stream.values = [908480 : index, 2094848 : index]} : i32 to index
-  %7 = arith.index_castui %3 {stream.alignment = 128 : index, stream.values = [2304 : index, 134016 : index]} : i32 to index
-  %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%4) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x576xf32>>
-  %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%5) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<576x144xf32>>
-  %10 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x144xf32>>
-  %11 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%7) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x144xf32>>
-  %12 = iree_tensor_ext.dispatch.tensor.load %8, offsets = [0, 0], sizes = [1, 576], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x576xf32>> -> tensor<1x576xf32>
-  %13 = iree_tensor_ext.dispatch.tensor.load %9, offsets = [0, 0], sizes = [576, 144], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<576x144xf32>> -> tensor<576x144xf32>
-  %14 = iree_tensor_ext.dispatch.tensor.load %10, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x144xf32>> -> tensor<1x144xf32>
-  %15 = tensor.empty() : tensor<1x144xf32>
-  %16 = linalg.fill ins(%cst : f32) outs(%15 : tensor<1x144xf32>) -> tensor<1x144xf32>
-  %17 = linalg.matmul ins(%12, %13 : tensor<1x576xf32>, tensor<576x144xf32>) outs(%16 : tensor<1x144xf32>) -> tensor<1x144xf32>
-  %18 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%17, %14 : tensor<1x144xf32>, tensor<1x144xf32>) outs(%15 : tensor<1x144xf32>) {
-  ^bb0(%in: f32, %in_0: f32, %out: f32):
-    %19 = arith.addf %in, %in_0 : f32
-    %20 = arith.maximumf %19, %cst : f32
-    linalg.yield %20 : f32
-  } -> tensor<1x144xf32>
-  iree_tensor_ext.dispatch.tensor.store %18, %11, offsets = [0, 0], sizes = [1, 144], strides = [1, 1] : tensor<1x144xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x144xf32>>
-  return
-}
-// Checks that the bounded stack allocation are created.
-// CHECK-LABEL: func.func @peel_partially_unaligned_matmul
-// Main loop:
-// CHECK: vector.fma
-// CHECK: arith.addf {{.*}} : vector<
-// CHECK: arith.maximumf {{.*}} : vector<
-//
-// Peeled loop:
-// CHECK: vector.fma
-// CHECK: arith.addf {{.*}} : vector<
-// CHECK: arith.maximumf {{.*}} : vector<
-
-// -----
-
 #pipeline_layout = #hal.pipeline.layout<constants = 6, bindings = [
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>,

@@ -252,7 +202,7 @@ func.func @multi_result() attributes {hal.executable.target = #executable_target
 // CHECK: scf.for
 // CHECK: scf.for
 // CHECK-COUNT-16: vector.fma
-// CHECK: arith.addf %{{.+}}, %{{.+}} : vector<8x32xf32>
+// CHECK: arith.addf %{{.+}}, %{{.+}} : vector<8x16xf32>
 
 // -----
```
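The deleted `@peel_partially_unaligned_matmul` test asserted the main/peeled loop split; with masking as the default on this target, partial tiles are handled by predicated ops instead. A hypothetical sketch (names and shapes are mine, not from this commit) of the kind of op the masked path produces:

```mlir
// A masked load covering only the valid tail elements; on AVX-512 such ops
// lower to loads predicated by a k opmask register.
%tail = vector.maskedload %base[%i], %mask, %passthru
          : memref<144xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
```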

compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir

Lines changed: 20 additions & 20 deletions
```diff
@@ -20,8 +20,8 @@ func.func @matvec_static(%3: tensor<128x384xf32>, %4: tensor<384xf32>) -> tensor
   } -> tensor<128xf32>
   return %7 : tensor<128xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<cache_parallel = [32, 0], distribution = [32, 0], vector_common_parallel = [32, 0], vector_reduction = [0, 16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [32, 0], vector_common_parallel = [16, 0], vector_reduction = [0, 16]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @matvec_static(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.generic

@@ -36,8 +36,8 @@ func.func @matvec_dynamic(%11: tensor<?xf32>, %12: tensor<?x?xf32>, %13: tensor<
   %15 = linalg.matvec ins(%12, %13 : tensor<?x?xf32>, tensor<?xf32>) outs(%14 : tensor<?xf32>) -> tensor<?xf32>
   return %15 : tensor<?xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<cache_parallel = [64, 0], distribution = [64, 0], vector_common_parallel = [32, 0], vector_reduction = [0, 16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [64, 0], vector_common_parallel = [16, 0], vector_reduction = [0, 16]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @matvec_dynamic(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.matvec

@@ -54,7 +54,7 @@ func.func @dot_static(%3: tensor<384xf32>, %4: tensor<384xf32>) -> tensor<f32> a
   return %7 : tensor<f32>
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [0], vector_reduction = [16]
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @dot_static(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.dot

@@ -70,7 +70,7 @@ func.func @dot_dynamic(%5: tensor<f32>, %8: tensor<?xf32>, %9: tensor<?xf32>) ->
   return %11 : tensor<f32>
 }
 // CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [0], vector_reduction = [16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @dot_dynamic(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.dot

@@ -169,8 +169,8 @@ func.func @matmul_partially_peel(%3: tensor<16641x16xf32>, %4: tensor<16x8xf32>)
   %7 = linalg.matmul ins(%3, %4 : tensor<16641x16xf32>, tensor<16x8xf32>) outs(%6 : tensor<16641x8xf32>) -> tensor<16641x8xf32>
   return %7 : tensor<16641x8xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<cache_parallel = [43, 8, 0], distribution = [43, 8, 0], vector_common_parallel = [8, 32, 0], vector_reduction = [0, 0, 16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [43, 8, 0], vector_common_parallel = [3, 8, 0], vector_reduction = [0, 0, 16]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @matmul_partially_peel(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.matmul

@@ -423,8 +423,8 @@ func.func @matmul_static(%3: tensor<384x512xf32>, %4: tensor<512x128xf32>) -> te
   %7 = linalg.matmul ins(%3, %4 : tensor<384x512xf32>, tensor<512x128xf32>) outs(%6 : tensor<384x128xf32>) -> tensor<384x128xf32>
   return %7 : tensor<384x128xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<cache_parallel = [48, 64, 0], distribution = [48, 64, 0], vector_common_parallel = [8, 32, 0], vector_reduction = [0, 0, 16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [48, 64, 0], vector_common_parallel = [8, 16, 0], vector_reduction = [0, 0, 16]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @matmul_static(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.matmul

@@ -486,8 +486,8 @@ func.func @gemm_unit_N(%5: tensor<?x1xf32>, %6: tensor<?x?xf32>, %7: tensor<?x1x
   %8 = linalg.matmul ins(%6, %5 : tensor<?x?xf32>, tensor<?x1xf32>) outs(%7 : tensor<?x1xf32>) -> tensor<?x1xf32>
   return %8 : tensor<?x1xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<cache_parallel = [64, 0, 0], distribution = [64, 0, 0], vector_common_parallel = [8, 32, 0], vector_reduction = [0, 0, 16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [64, 0, 0], vector_common_parallel = [8, 1, 0], vector_reduction = [0, 0, 16]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @gemm_unit_N(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.matmul

@@ -500,8 +500,8 @@ func.func @gemm_unit_M_unit_N(%4: tensor<1x?xf32>, %5: tensor<?x1xf32>, %6: tens
   %7 = linalg.matmul ins(%4, %5 : tensor<1x?xf32>, tensor<?x1xf32>) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
   return %7 : tensor<1x1xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = [1, 32, 0], vector_reduction = [0, 0, 16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = [1, 1, 0], vector_reduction = [0, 0, 16]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @gemm_unit_M_unit_N(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.matmul

@@ -517,8 +517,8 @@ func.func @matmul_odd(%4: tensor<33x16xf32>, %5: tensor<16x49xf32>, %6: tensor<3
   %9 = linalg.matmul ins(%4, %5 : tensor<33x16xf32>, tensor<16x49xf32>) outs(%8 : tensor<33x49xf32>) -> tensor<33x49xf32>
   return %9 : tensor<33x49xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<cache_parallel = [8, 49, 0], distribution = [8, 49, 0], vector_common_parallel = [8, 32, 0], vector_reduction = [0, 0, 16]>
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {{\{}}enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [8, 49, 0], vector_common_parallel = [3, 7, 0], vector_reduction = [0, 0, 16]>
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func.func @matmul_odd(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.matmul

@@ -997,7 +997,7 @@ func.func @non_trivial_program(%3: tensor<128x1x128x1xf32>, %4: tensor<128x1xf32
   %10 = linalg.matmul ins(%expanded, %4 : tensor<1x128xf32>, tensor<128x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
   return %10 : tensor<1x1xf32>
 }
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = [1, 32, 0], vector_reduction = [0, 0, 16]>
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_cpu.lowering_config<distribution = [0, 0, 0], vector_common_parallel = [1, 1, 0], vector_reduction = [0, 0, 16]>
 // CHECK-NOT: lowering_config
 // CHECK: func.func @non_trivial_program(
 // CHECK-SAME: translation_info = #[[TRANSLATION]]

@@ -1259,9 +1259,9 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
   return %1 : tensor<384x128xf32>
 }
 // CHECK-DAG: #[[CONFIG0:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[48, 64, 0]]>
-// CHECK-DAG: #[[CONFIG1:.+]] = #iree_cpu.lowering_config<cache_parallel = [48, 64], vector_common_parallel = [8, 32]>
-// CHECK-DAG: #[[CONFIG2:.+]] = #iree_cpu.lowering_config<cache_parallel = [48, 64, 0], distribution = [48, 64, 0], vector_common_parallel = [8, 32, 0], vector_reduction = [0, 0, 16]>
-// CHECK-DAG: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert, {enable_loop_peeling}>
+// CHECK-DAG: #[[CONFIG1:.+]] = #iree_cpu.lowering_config<vector_common_parallel = [8, 16]>
+// CHECK-DAG: #[[CONFIG2:.+]] = #iree_cpu.lowering_config<distribution = [48, 64, 0], vector_common_parallel = [8, 16, 0], vector_reduction = [0, 0, 16]>
+// CHECK-DAG: #[[TRANSLATION_INFO:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
 // CHECK: func @custom_op(
 // CHECK-SAME: translation_info = #translation
 // CHECK: iree_linalg_ext.custom_op
```
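A sanity check on the new tile sizes (my reading of the numbers, not something the commit states):

```
zmm width: 64 bytes (native_vector_size) / 4 bytes per f32 = 16 lanes
  -> vector_reduction = [0, 0, 16], vector_common_parallel = [8, 16, 0]
     instead of the old two-register width of 32
matmul_odd (33x49): 33 = 3 * 11, 49 = 7 * 7
  -> tiles [3, 7, 0] divide the odd trip counts exactly; masking covers any
     residue, so cache_parallel tiles and enable_loop_peeling are dropped
```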
