diff --git a/README.md b/README.md index 8486a71..bb3e207 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ OpenAPV provides the reference implementation of the [APV codec](#apv-codec) whi The OpenAPV supports the following features: - fully compliant with 422-10 and 400-10 profile of [APV codec](#apv-codec) -- Low complexity by optimization for ARM NEON and x86(64bit) SEE/AVX CPU +- Low complexity by optimization for ARM NEON and x86(64bit) SSE/AVX CPU - Supports tile-based multi-threading - Supports Various metadata including HDR10/10+ and user-defined format - Constant QP (CQP), average bitrate (ABR), and constant rate factor (CRF) are supported @@ -61,6 +61,12 @@ The APV codec standard has the following features: cmake --build build-arm ``` +- Build Instructions Android + ``` + cmake -G "MinGW Makefiles" -DCMAKE_BUILD_TYPE=Release -S . -B build-android -DCMAKE_TOOLCHAIN_FILE=/android-ndk-r26c/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=30 + cmake --build build-android + ``` + - Build Instructions Windows (Crosscompile) ``` cmake -S . -B build-windows -DCMAKE_TOOLCHAIN_FILE=windows_x86_64_toolchain.cmake diff --git a/src/neon/oapv_sad_neon.c b/src/neon/oapv_sad_neon.c index f494ae7..46dea4f 100644 --- a/src/neon/oapv_sad_neon.c +++ b/src/neon/oapv_sad_neon.c @@ -34,140 +34,70 @@ #if ARM_NEON +#if defined(__aarch64__) + +#define VADDVQ_S32_(s, v) \ + s += vaddvq_s32(v); +#define VADDVQ_S64_(s, v) \ + s += vaddvq_s64(v); + +#else // __aarch64__ + +#define VADDVQ_S32_(s, v) \ + { \ + int32x2_t tmp = vadd_s32(vget_low_s32(v), vget_high_s32(v)); \ + tmp = vpadd_s32(tmp, tmp); \ + s += vget_lane_s32(tmp, 0); \ + } +#define VADDVQ_S64_(s, v) \ + s += vgetq_lane_s64(v, 0) + vgetq_lane_s64(v, 1); +#define vabal_high_s16(a, b, c) \ + vabal_s16(a, vget_high_s16(b), vget_high_s16(c)) +#define vsubl_high_s16(a, b) \ + vsubl_s16(vget_high_s16(a), vget_high_s16(b)) +#define vmlal_high_s32(a, b, c) \ + vmlal_s32(a, vget_high_s32(b), vget_high_s32(c)) +#define vzip1q_s16(a, b) \ + vorrq_s32(vmovl_u16(vget_low_s16(a)), vshll_n_s16(vget_low_s16(b), 16)) +#define vzip1q_s32(a, b) \ + vorrq_s64(vmovl_u32(vget_low_s32(a)), vshll_n_s32(vget_low_s32(b), 32)) +#define vzip1q_s64(a, b) \ + vcombine_s64(vget_low_s64(a), vget_low_s64(b)) +#define vzip2q_s16(a, b) \ + vorrq_s32(vmovl_u16(vget_high_s16(a)), vshll_n_s16(vget_high_s16(b), 16)) +#define vzip2q_s32(a, b) \ + vorrq_s64(vmovl_u32(vget_high_s32(a)), vshll_n_s32(vget_high_s32(b), 32)) +#define vzip2q_s64(a, b) \ + vcombine_s64(vget_high_s64(a), vget_high_s64(b)) + +#endif // __aarch64__ + /* SAD for 16bit **************************************************************/ -int sad_16b_neon_8x2n(int w, int h, void *src1, void *src2, int s_src1, int s_src2) +static int sad_16b_neon_8x2n(int w, int h, void *src1, void *src2, int s_src1, int s_src2) { int sad = 0; s16* s1 = (s16*) src1; s16* s2 = (s16*) src2; int16x8_t s1_vector, s2_vector; - int32x4_t diff_part1, diff_part2, diff_part1_abs, diff_part2_abs, sad_vector, sad_vector_temp; - // Loop unrolled - { // Row 0 + int32x4_t sad_vector = vdupq_n_s32(0); + // Loop unrolling +#ifndef _MSC_VER +#pragma GCC unroll 8 +#endif + for (s32 i = 0; i < 8; ++i) + { // Row // Loading one row (8 elements) each of src1 and src_2 s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); s2 += s_src2; - - // Subtracting s1_vector from s2_vector and storing in 32 bits - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = 
vsubl_high_s16(s1_vector, s2_vector); - - //Taking absolute value of difference and adding them - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector = vaddq_s32(diff_part1_abs, diff_part2_abs); - } - { // Row 1 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = vsubl_high_s16(s1_vector, s2_vector); - - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); - // Updating sad_vector by adding the new values - sad_vector = vaddq_s32(sad_vector, sad_vector_temp); - } - { // Row 2 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = vsubl_high_s16(s1_vector, s2_vector); - - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); - sad_vector = vaddq_s32(sad_vector, sad_vector_temp); - } - { // Row 3 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = vsubl_high_s16(s1_vector, s2_vector); - - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); - sad_vector = vaddq_s32(sad_vector, sad_vector_temp); - } - { // Row 4 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = vsubl_high_s16(s1_vector, s2_vector); - - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); - sad_vector = vaddq_s32(sad_vector, sad_vector_temp); - } - { // Row 5 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = vsubl_high_s16(s1_vector, s2_vector); - - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); - sad_vector = vaddq_s32(sad_vector, sad_vector_temp); - } - { // Row 6 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = vsubl_high_s16(s1_vector, s2_vector); - - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); - sad_vector = vaddq_s32(sad_vector, sad_vector_temp); - } - { // Row 7 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff_part2 = vsubl_high_s16(s1_vector, s2_vector); - - diff_part1_abs = vabsq_s32(diff_part1); - diff_part2_abs = vabsq_s32(diff_part2); - - sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs); - sad_vector = vaddq_s32(sad_vector, sad_vector_temp); + + // Getting absolute difference s1_vector and s2_vector and storing in 32 bits + sad_vector = vabal_s16(sad_vector, vget_low_s16(s1_vector), 
vget_low_s16(s2_vector)); + sad_vector = vabal_high_s16(sad_vector, s1_vector, s2_vector); } // Adding all the elments in sad vector - sad = vaddvq_s32(sad_vector); + VADDVQ_S32_(sad, sad_vector) return sad; } @@ -182,13 +112,16 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in s64 ssd = 0; s16* s1 = (s16*) src1; s16* s2 = (s16*) src2; - s16 i; int16x8_t s1_vector, s2_vector; int32x4_t diff1, diff2; int32x2_t diff1_low, diff2_low; - int64x2_t sq_diff1_low, sq_diff1_high, sq_diff2_low, sq_diff2_high, sq_diff; - // Loop unrolling - { // Row 0 + int64x2_t sq_diff = vdupq_n_s64(0); + // Loop unrolling +#ifndef _MSC_VER +#pragma GCC unroll 8 +#endif + for (s32 i = 0; i < 8; ++i) + { // Row s1_vector = vld1q_s16(s1); s1 += s_src1; s2_vector = vld1q_s16(s2); @@ -199,163 +132,12 @@ static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, in diff1_low = vget_low_s32(diff1); diff2_low = vget_low_s32(diff2); - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff1_low, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, sq_diff2_high); + sq_diff = vmlal_s32(sq_diff, diff1_low, diff1_low); + sq_diff = vmlal_high_s32(sq_diff, diff1, diff1); + sq_diff = vmlal_s32(sq_diff, diff2_low, diff2_low); + sq_diff = vmlal_high_s32(sq_diff, diff2, diff2); } - { // Row 1 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff2 = vsubl_high_s16(s1_vector, s2_vector); - diff1_low = vget_low_s32(diff1); - diff2_low = vget_low_s32(diff2); - - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff, sq_diff1_low); - sq_diff = vaddq_s64(sq_diff, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, sq_diff2_high); - } - { // Row 2 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff2 = vsubl_high_s16(s1_vector, s2_vector); - diff1_low = vget_low_s32(diff1); - diff2_low = vget_low_s32(diff2); - - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff, sq_diff1_low); - sq_diff = vaddq_s64(sq_diff, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, sq_diff2_high); - } - { // Row 3 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff2 = vsubl_high_s16(s1_vector, s2_vector); - diff1_low = vget_low_s32(diff1); - diff2_low = vget_low_s32(diff2); - - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff, sq_diff1_low); - sq_diff = vaddq_s64(sq_diff, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, 
sq_diff2_high); - } - { // Row 4 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff2 = vsubl_high_s16(s1_vector, s2_vector); - diff1_low = vget_low_s32(diff1); - diff2_low = vget_low_s32(diff2); - - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff, sq_diff1_low); - sq_diff = vaddq_s64(sq_diff, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, sq_diff2_high); - } - { // Row 5 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff2 = vsubl_high_s16(s1_vector, s2_vector); - diff1_low = vget_low_s32(diff1); - diff2_low = vget_low_s32(diff2); - - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff, sq_diff1_low); - sq_diff = vaddq_s64(sq_diff, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, sq_diff2_high); - } - { // Row 6 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff2 = vsubl_high_s16(s1_vector, s2_vector); - diff1_low = vget_low_s32(diff1); - diff2_low = vget_low_s32(diff2); - - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff, sq_diff1_low); - sq_diff = vaddq_s64(sq_diff, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, sq_diff2_high); - } - { // Row 7 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector)); - diff2 = vsubl_high_s16(s1_vector, s2_vector); - diff1_low = vget_low_s32(diff1); - diff2_low = vget_low_s32(diff2); - - sq_diff1_low = vmull_s32(diff1_low, diff1_low); - sq_diff1_high = vmull_high_s32(diff1, diff1); - sq_diff2_low = vmull_s32(diff2_low, diff2_low); - sq_diff2_high = vmull_high_s32(diff2, diff2); - - sq_diff = vaddq_s64(sq_diff, sq_diff1_low); - sq_diff = vaddq_s64(sq_diff, sq_diff1_high); - sq_diff = vaddq_s64(sq_diff, sq_diff2_low); - sq_diff = vaddq_s64(sq_diff, sq_diff2_high); - } - ssd += vaddvq_s64(sq_diff); + VADDVQ_S64_(ssd, sq_diff) return ssd; } @@ -370,8 +152,12 @@ static void diff_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, s16* s1 = (s16*) src1; s16* s2 = (s16*) src2; int16x8_t s1_vector, s2_vector, diff_vector; - // Loop unrolled - { // Row 0 + // Loop unrolling +#ifndef _MSC_VER +#pragma GCC unroll 8 +#endif + for (s32 i = 0; i < 8; ++i) + { // Row // Loading one row (8 elements) each of src1 and src_2 s1_vector = vld1q_s16(s1); s1 += s_src1; @@ -382,87 +168,11 @@ static void diff_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, diff_vector = vsubq_s16(s1_vector, s2_vector); // Storing the result in diff - vst1q_s16(diff, diff_vector); - diff += s_diff; - } - { // Row 1 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - 
s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_vector = vsubq_s16(s1_vector, s2_vector); - - vst1q_s16(diff, diff_vector); - diff += s_diff; - } - { // Row 2 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_vector = vsubq_s16(s1_vector, s2_vector); - - vst1q_s16(diff, diff_vector); - diff += s_diff; - } - { // Row 3 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_vector = vsubq_s16(s1_vector, s2_vector); - - vst1q_s16(diff, diff_vector); - diff += s_diff; - } - { // Row 4 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_vector = vsubq_s16(s1_vector, s2_vector); - - vst1q_s16(diff, diff_vector); - diff += s_diff; - } - { // Row 5 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_vector = vsubq_s16(s1_vector, s2_vector); - - vst1q_s16(diff, diff_vector); - diff += s_diff; - } - { // Row 6 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_vector = vsubq_s16(s1_vector, s2_vector); - - vst1q_s16(diff, diff_vector); - diff += s_diff; - } - { // Row 7 - s1_vector = vld1q_s16(s1); - s1 += s_src1; - s2_vector = vld1q_s16(s2); - s2 += s_src2; - - diff_vector = vsubq_s16(s1_vector, s2_vector); - vst1q_s16(diff, diff_vector); diff += s_diff; } } + const oapv_fn_diff_t oapv_tbl_fn_diff_16b_neon[2] = { diff_16b_neon_8x8, NULL @@ -480,24 +190,23 @@ int oapv_dc_removed_had8x8_neon(pel* org, int s_org) int16x8_t pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b; int16x8_t out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b; int16x8_t out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b; - int16x8x2_t out0_8x16bx2, out1_8x16bx2, out2_8x16bx2, out3_8x16bx2; - - src0_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; - src1_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; - src2_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; - src3_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; - src4_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; - src5_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; - src6_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; - src7_8x16b = (vld1q_s16(&org[0])); - org = org + s_org; + + src0_8x16b = (vld1q_s16(org)); + org += s_org; + src1_8x16b = (vld1q_s16(org)); + org += s_org; + src2_8x16b = (vld1q_s16(org)); + org += s_org; + src3_8x16b = (vld1q_s16(org)); + org += s_org; + src4_8x16b = (vld1q_s16(org)); + org += s_org; + src5_8x16b = (vld1q_s16(org)); + org += s_org; + src6_8x16b = (vld1q_s16(org)); + org += s_org; + src7_8x16b = (vld1q_s16(org)); + org += s_org; /**************** 8x8 horizontal transform *******************************/ /*********************** 8x8 16 bit Transpose ************************/ @@ -789,14 +498,14 @@ int oapv_dc_removed_had8x8_neon(pel* org, int s_org) s32* p = (s32*)&src0_8x16b; p[0] = 0; - satd = vaddvq_s32(src0_8x16b); - satd += vaddvq_s32(src1_8x16b); - satd += vaddvq_s32(src2_8x16b); - satd += vaddvq_s32(src3_8x16b); - satd += vaddvq_s32(src4_8x16b); - satd += vaddvq_s32(src5_8x16b); - satd += vaddvq_s32(src6_8x16b); - satd += vaddvq_s32(src7_8x16b); + VADDVQ_S32_(satd, src0_8x16b) + VADDVQ_S32_(satd, src1_8x16b) + VADDVQ_S32_(satd, src2_8x16b) + VADDVQ_S32_(satd, src3_8x16b) + VADDVQ_S32_(satd, src4_8x16b) + VADDVQ_S32_(satd, src5_8x16b) + VADDVQ_S32_(satd, src6_8x16b) + VADDVQ_S32_(satd, src7_8x16b) src0_8x16b = vabsq_s32(out0a_8x16b); src1_8x16b = vabsq_s32(out1a_8x16b); @@ -807,14 +516,14 @@ 
int oapv_dc_removed_had8x8_neon(pel* org, int s_org) src6_8x16b = vabsq_s32(out6a_8x16b); src7_8x16b = vabsq_s32(out7a_8x16b); - satd += vaddvq_s32(src0_8x16b); - satd += vaddvq_s32(src1_8x16b); - satd += vaddvq_s32(src2_8x16b); - satd += vaddvq_s32(src3_8x16b); - satd += vaddvq_s32(src4_8x16b); - satd += vaddvq_s32(src5_8x16b); - satd += vaddvq_s32(src6_8x16b); - satd += vaddvq_s32(src7_8x16b); + VADDVQ_S32_(satd, src0_8x16b) + VADDVQ_S32_(satd, src1_8x16b) + VADDVQ_S32_(satd, src2_8x16b) + VADDVQ_S32_(satd, src3_8x16b) + VADDVQ_S32_(satd, src4_8x16b) + VADDVQ_S32_(satd, src5_8x16b) + VADDVQ_S32_(satd, src6_8x16b) + VADDVQ_S32_(satd, src7_8x16b) satd = (satd + 2) >> 2; return satd; diff --git a/src/neon/oapv_tq_neon.c b/src/neon/oapv_tq_neon.c index e423cca..a269d89 100644 --- a/src/neon/oapv_tq_neon.c +++ b/src/neon/oapv_tq_neon.c @@ -47,18 +47,32 @@ const s32 oapv_coeff[8][4] = {18,-50, 75,-89} // 8th row coeff }; +#if !defined(__aarch64__) +#define vpaddq_s32(a, b) \ + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), vpadd_s32(vget_low_s32(b), vget_high_s32(b))) +#define vmovl_high_s16(a) \ + vmovl_s16(vget_high_s16(a)) +#define vzip1_s16(a, b) \ + vget_low_s32(vorrq_s32(vmovl_u16(a), vshll_n_s16(b, 16))) +#define vzip1_s32(a, b) \ + vget_low_s64(vorrq_s64(vmovl_u32(a), vshll_n_s32(b, 32))) +#define vzip2_s16(a, b) \ + vget_high_s32(vorrq_s32(vmovl_u16(a), vshll_n_s16(b, 16))) +#define vzip2_s32(a, b) \ + vget_high_s64(vorrq_s64(vmovl_u32(a), vshll_n_s32(b, 32))) +#endif + #define multiply_s32(part1, part2, coeff, res) \ low = vmulq_s32(part1, coeff); \ high = vmulq_s32(part2, coeff); \ - res = vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)), vpadd_s32(vget_low_s32(high), vget_high_s32(high))); \ + res = vpaddq_s32(low, high) -static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line) +static void oapv_tx_pb8b_neon(const s16 *src, s16 *dst, const int shift, int line) { s16 i; - s16 *tempSrc = src; + const s16 *tempSrc = src; int16x4_t src_part1, src_part2; int32x4_t coeff0, coeff1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7; - int32x4_t add = vdupq_n_s32(1 << (shift - 1)); int32x4_t sh = vdupq_n_s32(-shift); int32x4_t EE_part1, EE_part2, EO_part1, EO_part2, low, high, result0, result1, result2, result3, result4, result5, result6, result7, E1, O1, E2, O2, res1, res2, res3, res4; @@ -110,8 +124,8 @@ static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line) res2 = vpaddq_s32(result5, result7); // add and shift - res1 = vshlq_s32(vaddq_s32(res1, add), sh); - res2 = vshlq_s32(vaddq_s32(res2, add), sh); + res1 = vrshlq_s32(res1, sh); + res2 = vrshlq_s32(res2, sh); // Loading src[16 - 19] and src[20 - 23] src_part1 = vld1_s16(tempSrc); @@ -146,8 +160,8 @@ static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line) res4 = vpaddq_s32(result5, result7); // add and shift - res3 = vshlq_s32(vaddq_s32(res3, add), sh); - res4 = vshlq_s32(vaddq_s32(res4, add), sh); + res3 = vrshlq_s32(res3, sh); + res4 = vrshlq_s32(res4, sh); // store result in destination vst1_s16(dst + 1 * line + i, vmovn_s32(vcombine_s32(vget_low_s32(res1), vget_low_s32(res3)))); @@ -173,10 +187,10 @@ static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line) multiply_s32(EO_part1, EO_part2, coeff6, result6); // add and shift - result0 = vshlq_s32(vaddq_s32(result0, add), sh); - result2 = vshlq_s32(vaddq_s32(result2, add), sh); - result4 = vshlq_s32(vaddq_s32(result4, add), sh); - result6 = vshlq_s32(vaddq_s32(result6, add), sh); + 
result0 = vrshlq_s32(result0, sh); + result2 = vrshlq_s32(result2, sh); + result4 = vrshlq_s32(result4, sh); + result6 = vrshlq_s32(result6, sh); // store result in destination vst1_s16(dst + 0 * line + i, vmovn_s32(result0)); @@ -206,11 +220,8 @@ const oapv_fn_tx_t oapv_tbl_fn_txb_neon[2] = # define OAPV_INVTX_COEF_6 35 // coef21, coef60 # define OAPV_INVTX_COEF_4_LOG2 6 // log2(coef00), log2(coef01), log2(coef40), log2(-coef41) -void oapv_itx_pb8b_opt_neon(s16* src, int shift1, int shift2, int line) +static void oapv_itx_pb8b_opt_neon(s16* src, int shift1, int shift2, int line) { - int32x4_t add1 = vdupq_n_s32(1 << (shift1 - 1)); - int32x4_t add2 = vdupq_n_s32(1 << (shift2 - 1)); - int32x4_t sh1 = vdupq_n_s32(-shift1); int32x4_t sh2 = vdupq_n_s32(-shift2); @@ -297,22 +308,22 @@ void oapv_itx_pb8b_opt_neon(s16* src, int shift1, int shift2, int line) int32x4_t E6 = vsubq_s32(EE3, EO3); int32x4_t E7 = vsubq_s32(EE2, EO2); - dest0 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E0, O0), add1), sh1)); - dest1 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E1, O1), add1), sh1)); - dest2 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E2, O2), add1), sh1)); - dest3 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E3, O3), add1), sh1)); - dest4 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E0, O0), add1), sh1)); - dest5 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E1, O1), add1), sh1)); - dest6 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E2, O2), add1), sh1)); - dest7 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E3, O3), add1), sh1)); - dest8 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E4, O4), add1), sh1)); - dest9 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E5, O5), add1), sh1)); - dest10 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E6, O6), add1), sh1)); - dest11 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E7, O7), add1), sh1)); - dest12 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E4, O4), add1), sh1)); - dest13 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E5, O5), add1), sh1)); - dest14 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E6, O6), add1), sh1)); - dest15 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E7, O7), add1), sh1)); + dest0 = vmovn_s32(vrshlq_s32(vaddq_s32(E0, O0), sh1)); + dest1 = vmovn_s32(vrshlq_s32(vaddq_s32(E1, O1), sh1)); + dest2 = vmovn_s32(vrshlq_s32(vaddq_s32(E2, O2), sh1)); + dest3 = vmovn_s32(vrshlq_s32(vaddq_s32(E3, O3), sh1)); + dest4 = vmovn_s32(vrshlq_s32(vsubq_s32(E0, O0), sh1)); + dest5 = vmovn_s32(vrshlq_s32(vsubq_s32(E1, O1), sh1)); + dest6 = vmovn_s32(vrshlq_s32(vsubq_s32(E2, O2), sh1)); + dest7 = vmovn_s32(vrshlq_s32(vsubq_s32(E3, O3), sh1)); + dest8 = vmovn_s32(vrshlq_s32(vaddq_s32(E4, O4), sh1)); + dest9 = vmovn_s32(vrshlq_s32(vaddq_s32(E5, O5), sh1)); + dest10 = vmovn_s32(vrshlq_s32(vaddq_s32(E6, O6), sh1)); + dest11 = vmovn_s32(vrshlq_s32(vaddq_s32(E7, O7), sh1)); + dest12 = vmovn_s32(vrshlq_s32(vsubq_s32(E4, O4), sh1)); + dest13 = vmovn_s32(vrshlq_s32(vsubq_s32(E5, O5), sh1)); + dest14 = vmovn_s32(vrshlq_s32(vsubq_s32(E6, O6), sh1)); + dest15 = vmovn_s32(vrshlq_s32(vsubq_s32(E7, O7), sh1)); int16x4_t t0 = vzip1_s16(dest0, dest1); int16x4_t t1 = vzip1_s16(dest2, dest3); @@ -406,22 +417,22 @@ void oapv_itx_pb8b_opt_neon(s16* src, int shift1, int shift2, int line) int32x4_t E6 = vsubq_s32(EE3, EO3); int32x4_t E7 = vsubq_s32(EE2, EO2); - int16x4_t v_src_0 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E0, O0), add2), sh2)); - int16x4_t v_src_1 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E1, O1), add2), sh2)); - int16x4_t v_src_2 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E2, O2), add2), sh2)); - int16x4_t v_src_3 = 
vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E3, O3), add2), sh2)); - int16x4_t v_src_4 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E0, O0), add2), sh2)); - int16x4_t v_src_5 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E1, O1), add2), sh2)); - int16x4_t v_src_6 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E2, O2), add2), sh2)); - int16x4_t v_src_7 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E3, O3), add2), sh2)); - int16x4_t v_src_8 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E4, O4), add2), sh2)); - int16x4_t v_src_9 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E5, O5), add2), sh2)); - int16x4_t v_src_10 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E6, O6), add2), sh2)); - int16x4_t v_src_11 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E7, O7), add2), sh2)); - int16x4_t v_src_12 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E4, O4), add2), sh2)); - int16x4_t v_src_13 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E5, O5), add2), sh2)); - int16x4_t v_src_14 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E6, O6), add2), sh2)); - int16x4_t v_src_15 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E7, O7), add2), sh2)); + int16x4_t v_src_0 = vmovn_s32(vrshlq_s32(vaddq_s32(E0, O0), sh2)); + int16x4_t v_src_1 = vmovn_s32(vrshlq_s32(vaddq_s32(E1, O1), sh2)); + int16x4_t v_src_2 = vmovn_s32(vrshlq_s32(vaddq_s32(E2, O2), sh2)); + int16x4_t v_src_3 = vmovn_s32(vrshlq_s32(vaddq_s32(E3, O3), sh2)); + int16x4_t v_src_4 = vmovn_s32(vrshlq_s32(vsubq_s32(E0, O0), sh2)); + int16x4_t v_src_5 = vmovn_s32(vrshlq_s32(vsubq_s32(E1, O1), sh2)); + int16x4_t v_src_6 = vmovn_s32(vrshlq_s32(vsubq_s32(E2, O2), sh2)); + int16x4_t v_src_7 = vmovn_s32(vrshlq_s32(vsubq_s32(E3, O3), sh2)); + int16x4_t v_src_8 = vmovn_s32(vrshlq_s32(vaddq_s32(E4, O4), sh2)); + int16x4_t v_src_9 = vmovn_s32(vrshlq_s32(vaddq_s32(E5, O5), sh2)); + int16x4_t v_src_10 = vmovn_s32(vrshlq_s32(vaddq_s32(E6, O6), sh2)); + int16x4_t v_src_11 = vmovn_s32(vrshlq_s32(vaddq_s32(E7, O7), sh2)); + int16x4_t v_src_12 = vmovn_s32(vrshlq_s32(vsubq_s32(E4, O4), sh2)); + int16x4_t v_src_13 = vmovn_s32(vrshlq_s32(vsubq_s32(E5, O5), sh2)); + int16x4_t v_src_14 = vmovn_s32(vrshlq_s32(vsubq_s32(E6, O6), sh2)); + int16x4_t v_src_15 = vmovn_s32(vrshlq_s32(vsubq_s32(E7, O7), sh2)); int16x4_t t0 = vzip1_s16(v_src_0, v_src_1); int16x4_t t1 = vzip1_s16(v_src_2, v_src_3); @@ -485,7 +496,7 @@ const oapv_fn_itx_t oapv_tbl_fn_itx_neon[2] = NULL }; -static int oapv_quant_neon(s16* coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset) +static int oapv_quant_neon(s16* coef, u8 qp, const int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset) { s64 offset; int shift; diff --git a/src/neon/oapv_tq_neon.h b/src/neon/oapv_tq_neon.h index 342372a..a1483ce 100644 --- a/src/neon/oapv_tq_neon.h +++ b/src/neon/oapv_tq_neon.h @@ -44,87 +44,6 @@ extern const oapv_fn_quant_t oapv_tbl_fn_quant_neon[2]; extern const oapv_fn_dquant_t oapv_tbl_fn_dquant_neon[2]; extern const oapv_fn_itx_t oapv_tbl_fn_itx_neon[2]; -#define CALCU_2x8(c0, c1, d0, d1) \ - v0 = _mm256_madd_epi16(s0, c0); \ - v1 = _mm256_madd_epi16(s1, c0); \ - v2 = _mm256_madd_epi16(s2, c0); \ - v3 = _mm256_madd_epi16(s3, c0); \ - v4 = _mm256_madd_epi16(s0, c1); \ - v5 = _mm256_madd_epi16(s1, c1); \ - v6 = _mm256_madd_epi16(s2, c1); \ - v7 = _mm256_madd_epi16(s3, c1); \ - v0 = _mm256_hadd_epi32(v0, v1); \ - v2 = _mm256_hadd_epi32(v2, v3); \ - v4 = _mm256_hadd_epi32(v4, v5); \ - v6 = _mm256_hadd_epi32(v6, v7); \ - d0 = _mm256_hadd_epi32(v0, v2); \ - d1 = _mm256_hadd_epi32(v4, v6) - -#define 
CALCU_2x8_ADD_SHIFT(d0, d1, d2, d3, add, shift) \ - d0 = _mm256_add_epi32(d0, add); \ - d1 = _mm256_add_epi32(d1, add); \ - d2 = _mm256_add_epi32(d2, add); \ - d3 = _mm256_add_epi32(d3, add); \ - d0 = _mm256_srai_epi32(d0, shift); \ - d1 = _mm256_srai_epi32(d1, shift); \ - d2 = _mm256_srai_epi32(d2, shift); \ - d3 = _mm256_srai_epi32(d3, shift); - -#define CALCU_2x4(c0, c1, c2, c3, d0, d1) \ - v0 = _mm256_madd_epi16(s0, c0); \ - v1 = _mm256_madd_epi16(s1, c0); \ - v2 = _mm256_madd_epi16(s0, c1); \ - v3 = _mm256_madd_epi16(s1, c1); \ - v4 = _mm256_madd_epi16(s0, c2); \ - v5 = _mm256_madd_epi16(s1, c2); \ - v6 = _mm256_madd_epi16(s0, c3); \ - v7 = _mm256_madd_epi16(s1, c3); \ - v0 = _mm256_hadd_epi32(v0, v1); \ - v2 = _mm256_hadd_epi32(v2, v3); \ - v4 = _mm256_hadd_epi32(v4, v5); \ - v6 = _mm256_hadd_epi32(v6, v7); \ - d0 = _mm256_hadd_epi32(v0, v2); \ - d1 = _mm256_hadd_epi32(v4, v6); \ - d0 = _mm256_permute4x64_epi64(d0, 0xd8); \ - d1 = _mm256_permute4x64_epi64(d1, 0xd8) - -#define CALCU_LINE_1x8(coeff0, dst) \ - v0 = _mm256_madd_epi16(s00, coeff0); \ - v1 = _mm256_madd_epi16(s01, coeff0); \ - v2 = _mm256_madd_epi16(s02, coeff0); \ - v3 = _mm256_madd_epi16(s03, coeff0); \ - v4 = _mm256_madd_epi16(s04, coeff0); \ - v5 = _mm256_madd_epi16(s05, coeff0); \ - v6 = _mm256_madd_epi16(s06, coeff0); \ - v7 = _mm256_madd_epi16(s07, coeff0); \ - v0 = _mm256_hadd_epi32(v0, v1); \ - v2 = _mm256_hadd_epi32(v2, v3); \ - v4 = _mm256_hadd_epi32(v4, v5); \ - v6 = _mm256_hadd_epi32(v6, v7); \ - v0 = _mm256_hadd_epi32(v0, v2); \ - v4 = _mm256_hadd_epi32(v4, v6); \ - v1 = _mm256_permute2x128_si256(v0, v4, 0x20); \ - v2 = _mm256_permute2x128_si256(v0, v4, 0x31); \ - dst = _mm256_add_epi32(v1, v2) - -#define CALCU_LINE_1x8_ADD_SHIFT(d0, d1, d2, d3, d4, d5, d6, d7, add, shift) \ - d0 = _mm256_add_epi32(d0, add); \ - d1 = _mm256_add_epi32(d1, add); \ - d2 = _mm256_add_epi32(d2, add); \ - d3 = _mm256_add_epi32(d3, add); \ - d4 = _mm256_add_epi32(d4, add); \ - d5 = _mm256_add_epi32(d5, add); \ - d6 = _mm256_add_epi32(d6, add); \ - d7 = _mm256_add_epi32(d7, add); \ - d0 = _mm256_srai_epi32(d0, shift); \ - d1 = _mm256_srai_epi32(d1, shift); \ - d2 = _mm256_srai_epi32(d2, shift); \ - d3 = _mm256_srai_epi32(d3, shift); \ - d4 = _mm256_srai_epi32(d4, shift); \ - d5 = _mm256_srai_epi32(d5, shift); \ - d6 = _mm256_srai_epi32(d6, shift); \ - d7 = _mm256_srai_epi32(d7, shift) - #endif // ARM_NEON /////////////////////////////////////////////////////////////////////////////// diff --git a/src/oapv.c b/src/oapv.c index 164d334..83ac42f 100644 --- a/src/oapv.c +++ b/src/oapv.c @@ -1227,7 +1227,7 @@ static int enc_platform_init(oapve_ctx_t *ctx) ctx->fn_itx = oapv_tbl_fn_itx_neon; ctx->fn_txb = oapv_tbl_fn_txb_neon; ctx->fn_quant = oapv_tbl_fn_quant_neon; - ctx->fn_had8x8 = oapv_dc_removed_had8x8; + ctx->fn_had8x8 = oapv_dc_removed_had8x8_neon; #endif return OAPV_OK; }
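
Note on the rounding-shift change in `oapv_tq_neon.c` above: the removed `vaddq_s32(x, add)` / `vshlq_s32(..., sh)` pairs and the `vrshlq_s32(x, sh)` calls that replace them compute the same value, because a rounding right shift by `shift` adds `1 << (shift - 1)` before the arithmetic shift. The sketch below is a minimal standalone check of that equivalence; it is illustrative only and not part of the patch — the file name and sample lane values are made up, and it assumes an Arm toolchain with NEON available.

```c
/* rounding_shift_check.c — build on an Arm/NEON target, e.g. on AArch64:
 *   cc -O2 rounding_shift_check.c -o rounding_shift_check
 */
#include <arm_neon.h>
#include <assert.h>
#include <stdio.h>

int main(void)
{
    const int shift = 5;                              /* any shift > 0 */
    const int32_t in[4] = { 100, -100, 12345, -31 };  /* arbitrary sample lanes */
    int32x4_t x   = vld1q_s32(in);
    int32x4_t add = vdupq_n_s32(1 << (shift - 1));    /* rounding offset */
    int32x4_t sh  = vdupq_n_s32(-shift);              /* negative = shift right */

    /* Old form: add the rounding offset explicitly, then arithmetic shift right. */
    int32x4_t old_form = vshlq_s32(vaddq_s32(x, add), sh);
    /* New form: single rounding shift, as now used in oapv_tx_pb8b_neon(). */
    int32x4_t new_form = vrshlq_s32(x, sh);

    int32_t a[4], b[4];
    vst1q_s32(a, old_form);
    vst1q_s32(b, new_form);
    for (int lane = 0; lane < 4; ++lane) {
        printf("lane %d: %d vs %d\n", lane, (int)a[lane], (int)b[lane]);
        assert(a[lane] == b[lane]);   /* both forms round identically */
    }
    return 0;
}
```

Besides removing the `add`, `add1`, and `add2` constants, folding the offset into the shift drops one vector add per rounded shift in the transform and inverse-transform loops.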