
Commit 873c3db

Merge branch 'master' into qs8_qu8_vprelu
2 parents: 524c036 + ff2433d

419 files changed: +12073, -12586 lines


CMakeLists.txt (+22, -807 lines; large diff not rendered by default)

bench/CMakeLists.txt (new file, +142 lines)
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# Copyright 2019 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# ---[ Build google benchmark
IF(NOT TARGET benchmark)
  IF(XNNPACK_USE_SYSTEM_LIBS)
    FIND_PACKAGE(benchmark REQUIRED)
  ELSE()
    SET(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "")
    ADD_SUBDIRECTORY(
      "${GOOGLEBENCHMARK_SOURCE_DIR}"
      "${CONFU_DEPENDENCIES_BINARY_DIR}/googlebenchmark")
  ENDIF()
ENDIF()

ADD_LIBRARY(bench-utils STATIC utils.cc)
TARGET_LINK_LIBRARIES(bench-utils PRIVATE benchmark::benchmark cpuinfo pthreadpool)
TARGET_LINK_LIBRARIES(bench-utils PRIVATE xnnpack-base hardware-config)
IF(XNNPACK_BUILD_LIBRARY)
  TARGET_LINK_LIBRARIES(bench-utils PRIVATE logging memory)
ENDIF()

# Helper libraries
ADD_LIBRARY(packq-benchmark STATIC packq-benchmark.cc)
TARGET_LINK_LIBRARIES(packq-benchmark PRIVATE XNNPACK benchmark::benchmark bench-utils)
IF(XNNPACK_ENABLE_KLEIDIAI)
  TARGET_LINK_LIBRARIES(packq-benchmark PRIVATE kleidiai)
ENDIF()

ADD_LIBRARY(gemm-benchmark STATIC gemm-benchmark.cc)
TARGET_LINK_LIBRARIES(gemm-benchmark PRIVATE XNNPACK benchmark::benchmark bench-utils)
IF(XNNPACK_ENABLE_KLEIDIAI)
  TARGET_LINK_LIBRARIES(gemm-benchmark PUBLIC kleidiai)
ENDIF()

ADD_SUBDIRECTORY(subgraph)
ADD_SUBDIRECTORY(operators)

# ---[ Build microkernel-level microbenchmarks
SET(MICROKERNEL_BENCHMARKS
  bf16-gemm
  f16-conv-hwc2chw
  f16-dwconv
  f16-dwconv2d-chw
  f16-f32acc-gemm
  f16-f32acc-igemm
  f16-f32acc-rdsum
  f16-f32acc-rsum
  f16-gemm
  f16-gemm-minmax
  f16-igemm
  f16-raddstoreexpminusmax
  f16-rmax
  f16-rmin
  f16-rminmax
  f16-rsum
  f16-spmm
  f32-bgemm
  f32-conv-hwc
  f32-conv-hwc2chw
  f32-dwconv
  f32-dwconv2d-chw
  f32-gemm
  f32-gemm-goi-minmax
  f32-gemm-minmax
  f32-igemm
  f32-qc4w-gemm
  f32-qc8w-gemm
  f32-raddexpminusmax
  f32-raddextexp
  f32-raddstoreexpminusmax
  f32-rdsum
  f32-rmax
  f32-rmin
  f32-rminmax
  f32-rsum
  f32-softmax
  f32-spmm
  f16-vcmul
  f32-vcmul
  f32-vscaleexpminusmax
  f32-vscaleextexp
  pf32-gemm-minmax
  qd8-f16-qb4w-gemm
  qd8-f16-qc4w-gemm
  qd8-f16-qc8w-gemm
  qd8-f32-qb4w-gemm
  qd8-f32-qc4w-gemm
  qd8-f32-qc8w-gemm
  qp8-f32-qc4w-gemm
  qp8-f32-qc8w-gemm
  qp8-f32-qb4w-gemm
  qs8-dwconv
  qs8-gemm
  qs8-qc8w-gemm-fp32
  qs8-rdsum
  qu8-rdsum
  qs8-rsum
  qu8-rsum
  qu8-gemm
  qu8-gemm-fp32
  qu8-gemm-rndnu
  x16-packw
  x32-packw
  x8-lut
  x8-packq
  x8-packw
  vunary
  vbinary
  xN-transposec
  xx-transposev)
FOREACH(BENCH ${MICROKERNEL_BENCHMARKS})
  ADD_EXECUTABLE(${BENCH}-bench ${BENCH}.cc)
  IF(CMAKE_C_COMPILER_ID STREQUAL "GNU" AND NOT XNNPACK_TARGET_PROCESSOR MATCHES "^riscv")
    # Attempt to work around slow compilation/linking of benchmarks with a lot of functions
    TARGET_COMPILE_OPTIONS(${BENCH}-bench PRIVATE "-fno-function-sections")
    TARGET_COMPILE_OPTIONS(${BENCH}-bench PRIVATE "-Os")
  ENDIF()
  TARGET_LINK_LIBRARIES(${BENCH}-bench PRIVATE
    bench-utils
    benchmark::benchmark
    gemm-benchmark
    hardware-config
    indirection
    logging
    microkernels-all
    microparams-init
    packing
    packq-benchmark
    pthreadpool)
  SET_TARGET_PROPERTIES(${BENCH}-bench PROPERTIES CXX_EXTENSIONS YES)
ENDFOREACH()

# Special-case
IF(XNNPACK_ENABLE_KLEIDIAI)
  TARGET_LINK_LIBRARIES(x8-packq-bench PRIVATE kleidiai)
ENDIF()
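
Each name in MICROKERNEL_BENCHMARKS corresponds to a <name>.cc source in bench/ that is compiled into a <name>-bench executable and linked against Google Benchmark plus the helper libraries above. As a rough orientation, a minimal, self-contained sketch of the general shape of such a target follows; the add_kernel function and the benchmark name are illustrative placeholders, not XNNPACK's actual benchmark code.

// Minimal Google Benchmark target, in the spirit of the <name>-bench
// executables built by the FOREACH loop above. The "kernel" here is a
// placeholder; the real XNNPACK benchmarks take an arch_flags argument and
// use hardware-config to gate specific microkernel variants.
#include <benchmark/benchmark.h>

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical kernel used only for illustration.
static void add_kernel(const float* a, const float* b, float* out, size_t n) {
  for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i];
}

static void BM_add_kernel(benchmark::State& state) {
  const size_t n = static_cast<size_t>(state.range(0));
  std::vector<float> a(n, 1.0f), b(n, 2.0f), out(n);
  for (auto _ : state) {
    add_kernel(a.data(), b.data(), out.data(), n);
    benchmark::DoNotOptimize(out.data());
  }
  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * n);
}
BENCHMARK(BM_add_kernel)->Arg(1024)->Arg(65536);

BENCHMARK_MAIN();

Every benchmark registered with BENCHMARK() in a source file ends up in that single <name>-bench binary, which is why the loop above needs only one ADD_EXECUTABLE per source.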

bench/bf16-gemm.cc (+1, -2 lines)

@@ -44,8 +44,7 @@ static void bf16_gemm(benchmark::State& state,
   auto f32rng =
       std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

-  xnnpack::Buffer<xnn_bfloat16> a(mc * kc +
-                                  XNN_EXTRA_BYTES / sizeof(xnn_bfloat16));
+  xnnpack::Buffer<xnn_bfloat16> a(mc * kc, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(),
                 [&] { return xnn_bfloat16_from_float(f32rng(rng)); });
   xnnpack::Buffer<xnn_bfloat16> k(nc * kc);
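
The bf16-gemm hunk above is representative of the other benchmark-source hunks below: the hand-written over-allocation size + XNN_EXTRA_BYTES / sizeof(T) becomes xnnpack::Buffer<T>(size, xnnpack::XnnExtraBytes), so the element count stays the logical problem size and the read-past-the-end padding that vectorized microkernels rely on is requested through a tag argument instead. A minimal, self-contained sketch of that constructor-tag idea follows; the PaddedBuffer type, the kWithExtraBytes tag, and the 64-byte pad are illustrative assumptions, not XNNPACK's actual xnnpack::Buffer implementation.

// Sketch of a buffer that over-allocates by a fixed number of bytes when
// asked to via a tag argument, mirroring the pattern adopted in this commit.
#include <cstddef>
#include <new>

// Stand-in for XNN_EXTRA_BYTES: how many bytes past the logical end a
// vectorized kernel may read. The value 64 is an assumption.
constexpr size_t kExtraBytes = 64;

// Tag type: passing kWithExtraBytes requests the extra padding.
struct ExtraBytesTag {};
constexpr ExtraBytesTag kWithExtraBytes{};

template <typename T>
class PaddedBuffer {
 public:
  explicit PaddedBuffer(size_t count)
      : size_(count),
        data_(static_cast<T*>(::operator new(count * sizeof(T)))) {}

  // Logical size stays `count`; only the allocation grows by the padding.
  PaddedBuffer(size_t count, ExtraBytesTag)
      : size_(count),
        data_(static_cast<T*>(::operator new(count * sizeof(T) + kExtraBytes))) {}

  ~PaddedBuffer() { ::operator delete(data_); }
  PaddedBuffer(const PaddedBuffer&) = delete;
  PaddedBuffer& operator=(const PaddedBuffer&) = delete;

  T* begin() { return data_; }
  T* end() { return data_ + size_; }  // end of the logical data, not the pad
  size_t size() const { return size_; }

 private:
  size_t size_;
  T* data_;
};

int main() {
  // Before: PaddedBuffer<float> a(mc * kc + kExtraBytes / sizeof(float));
  // After:  the element count is the real problem size, padding is implicit.
  PaddedBuffer<float> a(64 * 128, kWithExtraBytes);
  return a.size() == 64 * 128 ? 0 : 1;
}

Keeping the padding policy inside the buffer type lets callers state the logical size once, avoiding the element-versus-byte arithmetic that the repeated XNN_EXTRA_BYTES / sizeof(T) expressions risked getting wrong.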

bench/f16-conv-hwc2chw.cc (+3, -4 lines)

@@ -48,9 +48,8 @@ static void f16_conv_hwc2chw(
   const size_t output_width =
       (input_width + 2 * padding - kernel_size) / subsampling + 1;

-  xnnpack::Buffer<xnn_float16> input(input_height * input_width *
-                                         input_channels +
-                                     XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> input(
+      input_height * input_width * input_channels, xnnpack::XnnExtraBytes);
   std::generate(input.begin(), input.end(), f32rng);
   xnnpack::Buffer<xnn_float16> kernel(output_channels * kernel_size *
                                       kernel_size * input_channels);
@@ -59,7 +58,7 @@
   std::generate(bias.begin(), bias.end(), f32rng);

   xnnpack::Buffer<xnn_float16, XNN_ALLOCATION_ALIGNMENT> zero(
-      input_channels * input_width + XNN_EXTRA_BYTES / sizeof(xnn_float16));
+      input_channels * input_width, xnnpack::XnnExtraBytes);

   const size_t weights_elements =
       (kernel_size * kernel_size * input_channels + 1) *

bench/f16-dwconv.cc (+3, -4 lines)

@@ -73,16 +73,15 @@ static void bench_impl(uint64_t arch_flags, benchmark::State& state,
   const size_t c_stride =
       benchmark::utils::RoundUp<size_t>(channels, channel_tile);

-  xnnpack::Buffer<xnn_float16> a(channels * input_height * input_width +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> a(channels * input_height * input_width,
+                                 xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), f32rng);
   xnnpack::Buffer<xnn_float16> k(channels * kernel_height * kernel_width);
   std::generate(k.begin(), k.end(), f32rng);
   xnnpack::Buffer<xnn_float16> b(channels);
   std::generate(b.begin(), b.end(), f32rng);

-  xnnpack::Buffer<xnn_float16> z(channels +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> z(channels, xnnpack::XnnExtraBytes);

   const size_t w_elements = (kernel_size + 1) * c_stride;
   // Can read (primary_tile - kernel_size) elements after end of indirection

bench/f16-f32acc-gemm.cc (+1, -2 lines)

@@ -47,8 +47,7 @@ static void f16_gemm(benchmark::State& state,
   auto f32rng =
       std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

-  xnnpack::Buffer<xnn_float16> a(mc * kc +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> a(mc * kc, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), f32rng);
   xnnpack::Buffer<xnn_float16> k(nc * kc);
   std::generate(k.begin(), k.end(), f32rng);

bench/f16-f32acc-igemm.cc (+3, -5 lines)

@@ -68,18 +68,16 @@ static void f16_igemm(benchmark::State& state,
   const size_t kc_stride =
       benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

-  xnnpack::Buffer<xnn_float16> a(input_height * input_width *
-                                     input_pixel_stride +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> a(
+      input_height * input_width * input_pixel_stride, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), f32rng);
   xnnpack::Buffer<xnn_float16> k(group_output_channels * kernel_height *
                                  kernel_width * group_input_channels);
   std::generate(k.begin(), k.end(), f32rng);
   xnnpack::Buffer<xnn_float16> b(group_output_channels);
   std::generate(b.begin(), b.end(), f32rng);

-  xnnpack::Buffer<xnn_float16> z(group_input_channels +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> z(group_input_channels, xnnpack::XnnExtraBytes);

   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
   const size_t i_elements = mc_stride * kernel_size;

bench/f16-gemm.cc (+1, -2 lines)

@@ -46,8 +46,7 @@ static void f16_gemm(benchmark::State& state,
   auto f32rng =
       std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

-  xnnpack::Buffer<xnn_float16> a(mc * kc +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> a(mc * kc, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), f32rng);
   xnnpack::Buffer<xnn_float16> k(nc * kc);
   std::generate(k.begin(), k.end(), f32rng);

bench/f16-igemm.cc (+3, -5 lines)

@@ -69,18 +69,16 @@ static void f16_igemm(benchmark::State& state,
   const size_t kc_stride =
       benchmark::utils::RoundUp<size_t>(group_input_channels, kr * sr);

-  xnnpack::Buffer<xnn_float16> a(input_height * input_width *
-                                     input_pixel_stride +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> a(
+      input_height * input_width * input_pixel_stride, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), f32rng);
   xnnpack::Buffer<xnn_float16> k(group_output_channels * kernel_height *
                                  kernel_width * group_input_channels);
   std::generate(k.begin(), k.end(), f32rng);
   xnnpack::Buffer<xnn_float16> b(group_output_channels);
   std::generate(b.begin(), b.end(), f32rng);

-  xnnpack::Buffer<xnn_float16> z(group_input_channels +
-                                 XNN_EXTRA_BYTES / sizeof(xnn_float16));
+  xnnpack::Buffer<xnn_float16> z(group_input_channels, xnnpack::XnnExtraBytes);

   const size_t w_elements = (kernel_size * kc_stride + 1) * nc_stride;
   const size_t i_elements = mc_stride * kernel_size;

bench/f16-vcmul.cc (+2, -2 lines)

@@ -38,9 +38,9 @@ static void f16_vcmul(benchmark::State& state, uint64_t arch_flags,
   auto f16rng = std::bind(xnn_float16_from_float, f32rng);

   xnnpack::Buffer<xnn_float16, XNN_ALLOCATION_ALIGNMENT> a(
-      num_elements * 2 + XNN_EXTRA_BYTES / sizeof(xnn_float16));
+      num_elements * 2, xnnpack::XnnExtraBytes);
   xnnpack::Buffer<xnn_float16, XNN_ALLOCATION_ALIGNMENT> b(
-      num_elements * 2 + XNN_EXTRA_BYTES / sizeof(xnn_float16));
+      num_elements * 2, xnnpack::XnnExtraBytes);
   xnnpack::Buffer<xnn_float16, XNN_ALLOCATION_ALIGNMENT> product(num_elements *
                                                                  2);
   std::generate(a.begin(), a.end(), std::ref(f16rng));

bench/f32-bgemm.cc (+3, -6 lines)

@@ -54,8 +54,7 @@ static void f32_gemm(benchmark::State& state,
   auto f32rng =
       std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

-  xnnpack::Buffer<float> a(batch * dim_m * dim_k +
-                           XNN_EXTRA_BYTES / sizeof(float));
+  xnnpack::Buffer<float> a(batch * dim_m * dim_k, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), std::ref(f32rng));
   xnnpack::Buffer<float> b(batch * dim_n * dim_k);
   std::generate(b.begin(), b.end(), std::ref(f32rng));
@@ -121,8 +120,7 @@ static void f32_ppmm1p(benchmark::State& state,
   auto f32rng =
       std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

-  xnnpack::Buffer<float> a(batch * dim_m * dim_k +
-                           XNN_EXTRA_BYTES / sizeof(float));
+  xnnpack::Buffer<float> a(batch * dim_m * dim_k, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), std::ref(f32rng));
   xnnpack::Buffer<float> b(batch * dim_n * dim_k);
   std::generate(b.begin(), b.end(), std::ref(f32rng));
@@ -195,8 +193,7 @@ static void f32_ppmm2p(benchmark::State& state,
   auto f32rng =
       std::bind(std::uniform_real_distribution<float>(), std::ref(rng));

-  xnnpack::Buffer<float> a(batch * dim_m * dim_k +
-                           XNN_EXTRA_BYTES / sizeof(float));
+  xnnpack::Buffer<float> a(batch * dim_m * dim_k, xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), std::ref(f32rng));
   xnnpack::Buffer<float> b(batch * dim_n * dim_k);
   std::generate(b.begin(), b.end(), std::ref(f32rng));

bench/f32-conv-hwc.cc (+3, -3 lines)

@@ -49,8 +49,8 @@ static void f32_conv_hwc(
   const size_t output_width =
       (input_width + 2 * padding - kernel_size) / subsampling + 1;

-  xnnpack::Buffer<float> input(input_height * input_width * input_channels +
-                               XNN_EXTRA_BYTES / sizeof(float));
+  xnnpack::Buffer<float> input(input_height * input_width * input_channels,
+                               xnnpack::XnnExtraBytes);
   std::generate(input.begin(), input.end(), std::ref(f32rng));
   xnnpack::Buffer<float> kernel(output_channels * kernel_size * kernel_size *
                                 input_channels);
@@ -59,7 +59,7 @@ static void f32_conv_hwc(
   std::generate(bias.begin(), bias.end(), std::ref(f32rng));

   xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> zero(
-      input_channels * input_width + XNN_EXTRA_BYTES / sizeof(float));
+      input_channels * input_width, xnnpack::XnnExtraBytes);

   const size_t weights_elements =
       (kernel_size * kernel_size * input_channels + 1) *

bench/f32-conv-hwc2chw.cc (+3, -3 lines)

@@ -49,8 +49,8 @@ static void f32_conv_hwc2chw(
   const size_t output_width =
       (input_width + 2 * padding - kernel_size) / subsampling + 1;

-  xnnpack::Buffer<float> input(input_height * input_width * input_channels +
-                               XNN_EXTRA_BYTES / sizeof(float));
+  xnnpack::Buffer<float> input(input_height * input_width * input_channels,
+                               xnnpack::XnnExtraBytes);
   std::generate(input.begin(), input.end(), std::ref(f32rng));
   xnnpack::Buffer<float> kernel(output_channels * kernel_size * kernel_size *
                                 input_channels);
@@ -59,7 +59,7 @@ static void f32_conv_hwc2chw(
   std::generate(bias.begin(), bias.end(), std::ref(f32rng));

   xnnpack::Buffer<float, XNN_ALLOCATION_ALIGNMENT> zero(
-      input_channels * input_width + XNN_EXTRA_BYTES / sizeof(float));
+      input_channels * input_width, xnnpack::XnnExtraBytes);

   const size_t weights_elements =
       (kernel_size * kernel_size * input_channels + 1) *

bench/f32-dwconv.cc (+3, -3 lines)

@@ -72,15 +72,15 @@ static void bench_impl(uint64_t arch_flags, benchmark::State& state,
   const size_t c_stride =
       benchmark::utils::RoundUp<size_t>(channels, channel_tile);

-  xnnpack::Buffer<float> a(channels * input_height * input_width +
-                           XNN_EXTRA_BYTES / sizeof(float));
+  xnnpack::Buffer<float> a(channels * input_height * input_width,
+                           xnnpack::XnnExtraBytes);
   std::generate(a.begin(), a.end(), std::ref(f32rng));
   xnnpack::Buffer<float> k(channels * kernel_height * kernel_width);
   std::generate(k.begin(), k.end(), std::ref(f32rng));
   xnnpack::Buffer<float> b(channels);
   std::generate(b.begin(), b.end(), std::ref(f32rng));

-  xnnpack::Buffer<float> z(channels + XNN_EXTRA_BYTES / sizeof(float));
+  xnnpack::Buffer<float> z(channels, xnnpack::XnnExtraBytes);

   const size_t w_elements = (kernel_size + 1) * c_stride;
   // Can read (primary_tile - kernel_size) elements after end of indirection
