Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ check_include_file("byteswap.h" HAVE_BYTESWAP_H)
check_include_file("inttypes.h" HAVE_INTTYPES_H)
check_include_file("stdint.h" HAVE_STDINT_H)
check_include_file("stdbool.h" HAVE_STDBOOL_H)
check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)

check_include_file("threads.h" HAVE_THREADS_H)
if(MSVC AND CMAKE_C_STANDARD GREATER_EQUAL 11)
Expand All @@ -128,10 +127,14 @@ if(NOT HAVE_STDINT_H OR NOT HAVE_STDBOOL_H)
message(SEND_ERROR "Header stdint.h and/or stdbool.h not found")
endif()

if(MSVC)
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
else()
check_include_file("x86intrin.h" FLAC__HAS_X86INTRIN)
check_include_file("arm_neon.h" FLAC__HAS_NEONINTRIN)
if(!FLAC__HAS_NEONINTRIN)
# Check for x86 after checking for ARM - treat ARM64EC as ARM not x86
if(MSVC)
check_include_file("intrin.h" FLAC__HAS_X86INTRIN)
else()
check_include_file("x86intrin.h" FLAC__HAS_X86INTRIN)
endif()
endif()


Expand Down
2 changes: 1 addition & 1 deletion cmake/CheckCPUArch.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ macro(CHECK_CPU_ARCH_X86 VARIABLE)
endmacro(CHECK_CPU_ARCH_X86)

macro(CHECK_CPU_ARCH_ARM64 VARIABLE)
_CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__)" ${VARIABLE})
_CHECK_CPU_ARCH(arm64 "defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)" ${VARIABLE})
endmacro(CHECK_CPU_ARCH_ARM64)
17 changes: 9 additions & 8 deletions src/libFLAC/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@ include(CheckCSourceCompiles)
include(CheckCPUArch)
include(CheckA64NEON)

check_cpu_arch_x64(FLAC__CPU_X86_64)
if(NOT FLAC__CPU_X86_64)
check_cpu_arch_x86(FLAC__CPU_IA32)
check_cpu_arch_arm64(FLAC__CPU_ARM64)
if(FLAC__CPU_ARM64)
check_a64neon(FLAC__HAS_A64NEONINTRIN)
else()
# Check for x86 after checking for ARM - treat ARM64EC as ARM not x86
check_cpu_arch_x64(FLAC__CPU_X86_64)
if(NOT FLAC__CPU_X86_64)
check_cpu_arch_x86(FLAC__CPU_IA32)
endif()
endif()

if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
Expand All @@ -21,11 +27,6 @@ if(FLAC__CPU_X86_64 OR FLAC__CPU_IA32)
if(WITH_AVX AND MSVC)
set_source_files_properties(fixed_intrin_avx2.c lpc_intrin_avx2.c stream_encoder_intrin_avx2.c lpc_intrin_fma.c PROPERTIES COMPILE_FLAGS /arch:AVX2)
endif()
else()
check_cpu_arch_arm64(FLAC__CPU_ARM64)
if(FLAC__CPU_ARM64)
check_a64neon(FLAC__HAS_A64NEONINTRIN)
endif()
endif()

if(NOT WITH_ASM)
Expand Down
100 changes: 54 additions & 46 deletions src/libFLAC/lpc_intrin_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ void FLAC__lpc_compute_autocorrelation_intrin_neon_lag_8(const FLAC__real data[]

#endif /* ifdef FLAC__HAS_A64NEONINTRIN */

static inline int32x4_t load_int32x4(int32_t a, int32_t b, int32_t c, int32_t d) {
#ifdef _MSC_VER // MSVC does not support aggregate initializer of Neon types
int32_t temp[] = {a,b,c,d};
return vld1q_s32(temp);
#else
return {a,b,c,d};
#endif
}

#define MUL_32_BIT_LOOP_UNROOL_3(qlp_coeff_vec, lane) \
summ_0 = vmulq_laneq_s32(tmp_vec[0], qlp_coeff_vec, lane); \
Expand All @@ -91,9 +99,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
if(order > 8) {
if(order > 10) {
if (order == 12) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], qlp_coeff[11]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = vld1q_s32(qlp_coeff + 8);

tmp_vec[0] = vld1q_s32(data - 12);
tmp_vec[1] = vld1q_s32(data - 11);
Expand Down Expand Up @@ -150,9 +158,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}

else { /* order == 11 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], qlp_coeff[10], 0);

tmp_vec[0] = vld1q_s32(data - 11);
tmp_vec[1] = vld1q_s32(data - 10);
Expand Down Expand Up @@ -208,9 +216,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
else {
if(order == 10) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], 0, 0);

tmp_vec[0] = vld1q_s32(data - 10);
tmp_vec[1] = vld1q_s32(data - 9);
Expand Down Expand Up @@ -261,9 +269,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 9 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 9);
tmp_vec[1] = vld1q_s32(data - 8);
Expand Down Expand Up @@ -313,8 +321,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
else if(order > 4) {
if(order > 6) {
if(order == 8) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);

tmp_vec[0] = vld1q_s32(data - 8);
tmp_vec[1] = vld1q_s32(data - 7);
Expand Down Expand Up @@ -357,8 +365,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 7 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0);

tmp_vec[0] = vld1q_s32(data - 7);
tmp_vec[1] = vld1q_s32(data - 6);
Expand Down Expand Up @@ -400,8 +408,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
else {
if(order == 6) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], 0, 0);

tmp_vec[0] = vld1q_s32(data - 6);
tmp_vec[1] = vld1q_s32(data - 5);
Expand Down Expand Up @@ -438,8 +446,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 5 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 5);

Expand Down Expand Up @@ -478,7 +486,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
else {
if(order > 2) {
if(order == 4) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down Expand Up @@ -507,7 +515,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
}
else { /* order == 3 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand All @@ -534,7 +542,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_neon(const FLAC__in
}
else {
if(order == 2) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], 0, 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down Expand Up @@ -679,9 +687,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
if(order > 8) {
if(order > 10) {
if(order == 12) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],qlp_coeff[11]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = vld1q_s32(qlp_coeff + 8);

tmp_vec[0] = vld1q_s32(data - 12);
tmp_vec[1] = vld1q_s32(data - 11);
Expand Down Expand Up @@ -735,9 +743,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
}
}
else { /* order == 11 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4],qlp_coeff[5],qlp_coeff[6],qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8],qlp_coeff[9],qlp_coeff[10],0);

tmp_vec[0] = vld1q_s32(data - 11);
tmp_vec[1] = vld1q_s32(data - 10);
Expand Down Expand Up @@ -791,9 +799,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
else
{
if (order == 10) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], qlp_coeff[9], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], qlp_coeff[9], 0, 0);

tmp_vec[0] = vld1q_s32(data - 10);
tmp_vec[1] = vld1q_s32(data - 9);
Expand Down Expand Up @@ -843,9 +851,9 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
}

else /* order == 9 */ {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_2 = {qlp_coeff[8], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);
int32x4_t qlp_coeff_2 = load_int32x4(qlp_coeff[8], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 9);
tmp_vec[1] = vld1q_s32(data - 8);
Expand Down Expand Up @@ -897,8 +905,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
{
if (order == 8)
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], qlp_coeff[7]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = vld1q_s32(qlp_coeff + 4);

tmp_vec[0] = vld1q_s32(data - 8);
tmp_vec[1] = vld1q_s32(data - 7);
Expand Down Expand Up @@ -942,8 +950,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
}
else /* order == 7 */
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], qlp_coeff[6], 0);

tmp_vec[0] = vld1q_s32(data - 7);
tmp_vec[1] = vld1q_s32(data - 6);
Expand Down Expand Up @@ -986,8 +994,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
else
{
if (order == 6) {
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], qlp_coeff[5], 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], qlp_coeff[5], 0, 0);

tmp_vec[0] = vld1q_s32(data - 6);
tmp_vec[1] = vld1q_s32(data - 5);
Expand Down Expand Up @@ -1026,8 +1034,8 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA

else
{ /* order == 5 */
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_1 = {qlp_coeff[4], 0, 0, 0};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);
int32x4_t qlp_coeff_1 = load_int32x4(qlp_coeff[4], 0, 0, 0);

tmp_vec[0] = vld1q_s32(data - 5);

Expand Down Expand Up @@ -1066,7 +1074,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
{
if (order == 4)
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], qlp_coeff[3]};
int32x4_t qlp_coeff_0 = vld1q_s32(qlp_coeff);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down Expand Up @@ -1095,7 +1103,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
else
{ /* order == 3 */

int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], qlp_coeff[2], 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand All @@ -1122,7 +1130,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_neon(const FLA
{
if (order == 2)
{
int32x4_t qlp_coeff_0 = {qlp_coeff[0], qlp_coeff[1], 0, 0};
int32x4_t qlp_coeff_0 = load_int32x4(qlp_coeff[0], qlp_coeff[1], 0, 0);

for (i = 0; i < (int)data_len - 11; i += 12)
{
Expand Down