Skip to content

Commit 0d382e1

Browse files
alankelly authored and xnnpack-bot committed
Use optimal microkernel on Cortex X3
PiperOrigin-RevId: 729004520
1 parent 9f9f69c commit 0d382e1

File tree

5 files changed

+15
-8
lines changed

5 files changed

+15
-8
lines changed

cmake/gen/aarch64_microkernels.cmake

+2 -2
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ SET(PROD_AARCH64_ASM_MICROKERNEL_SRCS
2828
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S
2929
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S
3030
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-2.S
31-
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S
3231
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S
3332
src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S
3433
src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S
@@ -44,6 +43,7 @@ SET(PROD_AARCH64_ASM_MICROKERNEL_SRCS
4443
src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S
4544
src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S
4645
src/f32-gemm/gen/f32-gemm-7x8-minmax-asm-aarch64-neonfma-ld128-2.S
46+
src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld128-2.S
4747
src/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S
4848
src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S
4949
src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S
@@ -134,6 +134,7 @@ SET(NON_PROD_AARCH64_ASM_MICROKERNEL_SRCS
134134
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S
135135
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S
136136
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S
137+
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S
137138
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S
138139
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S
139140
src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S
@@ -183,7 +184,6 @@ SET(NON_PROD_AARCH64_ASM_MICROKERNEL_SRCS
183184
src/f32-gemm/gen/f32-gemm-7x8-minmax-asm-aarch64-neonfma-ld64-2.S
184185
src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld32-2.S
185186
src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld64-2.S
186-
src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld128-2.S
187187
src/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S
188188
src/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128.S
189189
src/f32-gemm/gen/f32-gemm-goi-4x8-minmax-asm-aarch64-neonfma-ld128.S

cmake/gen/rvv_microkernels.cmake

+1 -1
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ SET(PROD_RVV_MICROKERNEL_SRCS
5454
src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c
5555
src/qs8-dwconv/gen/qs8-dwconv-9p8vc-minmax-fp32-rvv.c
5656
src/qs8-dwconv/gen/qs8-dwconv-25p8vc-minmax-fp32-rvv.c
57+
src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c
5758
src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8vc-minmax-fp32-rvv.c
5859
src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p8vc-minmax-fp32-rvv.c
5960
src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8vc-minmax-fp32-rvv.c
60-
src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c
6161
src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c
6262
src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c
6363
src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c

gen/aarch64_microkernels.bzl

+2 -2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ PROD_AARCH64_ASM_MICROKERNEL_SRCS = [
2424
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S",
2525
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-cortex-a75.S",
2626
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-2.S",
27-
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S",
2827
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4.S",
2928
"src/f32-gemm/gen/f32-gemm-4x2-minmax-asm-aarch64-neonfma-ld128.S",
3029
"src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-cortex-a53-prfm.S",
@@ -40,6 +39,7 @@ PROD_AARCH64_ASM_MICROKERNEL_SRCS = [
4039
"src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-cortex-a75-prfm.S",
4140
"src/f32-gemm/gen/f32-gemm-6x8-minmax-asm-aarch64-neonfma-ld128.S",
4241
"src/f32-gemm/gen/f32-gemm-7x8-minmax-asm-aarch64-neonfma-ld128-2.S",
42+
"src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld128-2.S",
4343
"src/f32-igemm/f32-igemm-4x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
4444
"src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a55.S",
4545
"src/f32-igemm/f32-igemm-6x8-minmax-asm-aarch64-neonfma-cortex-a73.S",
@@ -131,6 +131,7 @@ NON_PROD_AARCH64_ASM_MICROKERNEL_SRCS = [
131131
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64-prfm.S",
132132
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld64.S",
133133
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2-prfm.S",
134+
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc2.S",
134135
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-acc4-prfm.S",
135136
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S",
136137
"src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld128.S",
@@ -180,7 +181,6 @@ NON_PROD_AARCH64_ASM_MICROKERNEL_SRCS = [
180181
"src/f32-gemm/gen/f32-gemm-7x8-minmax-asm-aarch64-neonfma-ld64-2.S",
181182
"src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld32-2.S",
182183
"src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld64-2.S",
183-
"src/f32-gemm/gen/f32-gemm-8x8-minmax-asm-aarch64-neonfma-ld128-2.S",
184184
"src/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128-prfm.S",
185185
"src/f32-gemm/gen/f32-gemm-goi-1x8-minmax-asm-aarch64-neonfma-ld128.S",
186186
"src/f32-gemm/gen/f32-gemm-goi-4x8-minmax-asm-aarch64-neonfma-ld128.S",

gen/rvv_microkernels.bzl

+7
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,17 @@ PROD_RVV_MICROKERNEL_SRCS = [
4848
"src/f32-vrnd/gen/f32-vrndu-rvv-u4v.c",
4949
"src/f32-vrnd/gen/f32-vrndz-rvv-u4v.c",
5050
"src/f32-vrsqrt/gen/f32-vrsqrt-rvv-rsqrt-u4v.c",
51+
"src/qs8-dwconv/gen/qs8-dwconv-9p8vc-minmax-fp32-rvv.c",
52+
"src/qs8-dwconv/gen/qs8-dwconv-25p8vc-minmax-fp32-rvv.c",
5153
"src/qs8-f32-vcvt/gen/qs8-f32-vcvt-rvv-u2v.c",
54+
"src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-3p8vc-minmax-fp32-rvv.c",
55+
"src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-9p8vc-minmax-fp32-rvv.c",
56+
"src/qs8-qc8w-dwconv/gen/qs8-qc8w-dwconv-25p8vc-minmax-fp32-rvv.c",
5257
"src/qs8-vlrelu/gen/qs8-vlrelu-rvv-u2v.c",
5358
"src/qs8-vmul/gen/qs8-vmul-minmax-f32-rvv-u2v.c",
5459
"src/qs8-vmulc/gen/qs8-vmulc-minmax-f32-rvv-u2v.c",
60+
"src/qu8-dwconv/gen/qu8-dwconv-9p8vc-minmax-fp32-rvv.c",
61+
"src/qu8-dwconv/gen/qu8-dwconv-25p8vc-minmax-fp32-rvv.c",
5562
"src/qu8-f32-vcvt/gen/qu8-f32-vcvt-rvv-u2v.c",
5663
"src/qu8-vlrelu/gen/qu8-vlrelu-rvv-u2v.c",
5764
"src/qu8-vmul/gen/qu8-vmul-minmax-f32-rvv-u2v.c",

src/configs/gemm-config.c

+3 -3
Original file line numberDiff line numberDiff line change
@@ -591,12 +591,12 @@ static void init_f32_gemm_config(void) {
591591
case cpuinfo_uarch_cortex_x3:
592592
case cpuinfo_uarch_neoverse_v2:
593593
// TODO(fbarchard): Implement asm with indexed inputs
594-
f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_acc2);
595-
f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_lane_ld128);
594+
f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_ld128_2);
595+
f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(8)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_8x8__asm_aarch64_neonfma_ld128_2);
596596
f32_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
597597
f32_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
598598
f32_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x8__neon_ld4lane_u4_prfm;
599-
f32_gemm_config.mr = 6;
599+
f32_gemm_config.mr = 8;
600600
f32_gemm_config.nr = 8;
601601
break;
602602
case cpuinfo_uarch_cortex_a78:

0 commit comments

Comments
 (0)