Skip to content

Commit c332d7a

Browse files
committed
Implemented AArch64 ASIMD optimized lanczos function generator
1 parent 0b6090c commit c332d7a

File tree

6 files changed

+213
-6
lines changed

6 files changed

+213
-6
lines changed

include/private/dsp/arch/aarch64/asimd/pmath.h

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <private/dsp/arch/aarch64/asimd/pmath/exp.h>
3232
#include <private/dsp/arch/aarch64/asimd/pmath/fmop_kx.h>
3333
#include <private/dsp/arch/aarch64/asimd/pmath/fmop_vv.h>
34+
#include <private/dsp/arch/aarch64/asimd/pmath/lanczos.h>
3435
#include <private/dsp/arch/aarch64/asimd/pmath/log.h>
3536
#include <private/dsp/arch/aarch64/asimd/pmath/lramp.h>
3637
#include <private/dsp/arch/aarch64/asimd/pmath/minmax.h>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
/*
2+
* Copyright (C) 2025 Linux Studio Plugins Project <https://lsp-plug.in/>
3+
* (C) 2025 Vladimir Sadovnikov <[email protected]>
4+
*
5+
* This file is part of lsp-dsp-lib
6+
* Created on: 22 апр. 2025 г.
7+
*
8+
* lsp-dsp-lib is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU Lesser General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* any later version.
12+
*
13+
* lsp-dsp-lib is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
* GNU Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public License
19+
* along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
20+
*/
21+
22+
#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_PMATH_LANCZOS_H_
23+
#define PRIVATE_DSP_ARCH_AARCH64_ASIMD_PMATH_LANCZOS_H_
24+
25+
#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL
26+
#error "This header should not be included directly"
27+
#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL */
28+
29+
#include <private/dsp/arch/aarch64/asimd/pmath/sin.h>
30+
31+
namespace lsp
32+
{
33+
namespace asimd
34+
{
35+
IF_ARCH_AARCH64(
36+
static const uint32_t lanczos_const[] __lsp_aligned16 =
37+
{
38+
LSP_DSP_VEC4(0x358637bd), // +0x00: Sinc threshold = 1e-6
39+
LSP_DSP_VEC4(0x3f800000), // +0x10: 1.0
40+
};
41+
)
42+
43+
typedef struct lanczos_gen_t
44+
{
45+
float x1[8]; // +0x00: Computed X1
46+
float n[8]; // +0x20: Numerator
47+
} lanczos_gen_t;
48+
49+
#define LANCZOS_GEN_FUNC_X8 \
50+
/* v0 = x1[0] */ \
51+
/* v1 = x1[1] */ \
52+
__ASM_EMIT("fmul v2.4s, v0.4s, v27.4s") /* v2 = x2 = x1*a */ \
53+
__ASM_EMIT("fmul v3.4s, v1.4s, v27.4s") \
54+
__ASM_EMIT("fabs v4.4s, v0.4s") /* v4 = fabsf(x1) */ \
55+
__ASM_EMIT("fabs v5.4s, v1.4s") \
56+
__ASM_EMIT("fmul v8.4s, v0.4s, v2.4s") /* v8 = x1*x2 */ \
57+
__ASM_EMIT("fmul v9.4s, v1.4s, v3.4s") \
58+
__ASM_EMIT("frecpe v10.4s, v8.4s") /* v10 = s2 */ \
59+
__ASM_EMIT("frecpe v11.4s, v9.4s") \
60+
__ASM_EMIT("frecps v12.4s, v10.4s, v8.4s") /* v12 = (2 - R*s2) */ \
61+
__ASM_EMIT("frecps v13.4s, v11.4s, v9.4s") \
62+
__ASM_EMIT("fmul v10.4s, v12.4s, v10.4s") /* v10 = s2' = s2 * (2 - R*s2) */ \
63+
__ASM_EMIT("fmul v11.4s, v13.4s, v11.4s") \
64+
__ASM_EMIT("frecps v12.4s, v10.4s, v8.4s") /* v12 = (2 - R*s2') */ \
65+
__ASM_EMIT("frecps v13.4s, v11.4s, v9.4s") \
66+
__ASM_EMIT("fmul v8.4s, v12.4s, v10.4s") /* v8 = n = 1/(x1*x2) */ \
67+
__ASM_EMIT("fmul v9.4s, v13.4s, v11.4s") \
68+
__ASM_EMIT("fadd v0.4s, v0.4s, v16.4s") /* v0 = x1 + PI/2 */ \
69+
__ASM_EMIT("fadd v1.4s, v1.4s, v16.4s") \
70+
__ASM_EMIT("fadd v2.4s, v2.4s, v16.4s") /* v2 = x2 + PI/2 */ \
71+
__ASM_EMIT("fadd v3.4s, v3.4s, v16.4s") \
72+
__ASM_EMIT("stp q4, q5, [%[state], 0x00]") /* save fabfs(x1) */ \
73+
__ASM_EMIT("stp q8, q9, [%[state], 0x20]") /* save n */ \
74+
SINF_X_PLUS_PI_2_CORE_X16 \
75+
/* v0 = sinf(x1[0]), v1 = sinf(x1[1]) */ \
76+
/* v2 = sinf(x2[0]), v3 = sinf(x2[1]) */ \
77+
__ASM_EMIT("ldp q4, q5, [%[state], 0x00]") /* v4 = fabsf(x1) */ \
78+
__ASM_EMIT("ldp q6, q7, [%[state], 0x20]") /* v6 = n = 1/(x1*x2) */ \
79+
__ASM_EMIT("ldp q8, q9, [%[LC]]") /* v8 = 1e-6, v9 = 1.0 */ \
80+
__ASM_EMIT("fmul v0.4s, v0.4s, v2.4s") /* v0 = sinf(x1)*sinf(x2) */ \
81+
__ASM_EMIT("fmul v1.4s, v1.4s, v3.4s") \
82+
__ASM_EMIT("fcmge v2.4s, v4.4s, v8.4s") /* v2 = [ fabsf(x1) >= 1e-6 ] */ \
83+
__ASM_EMIT("fcmge v3.4s, v5.4s, v8.4s") \
84+
__ASM_EMIT("fmul v0.4s, v0.4s, v6.4s") /* v6 = sinf(x1)*sinf(x2)/(x1*x2) */ \
85+
__ASM_EMIT("fmul v1.4s, v1.4s, v7.4s") \
86+
__ASM_EMIT("fcmgt v4.4s, v26.4s, v4.4s") /* v4 = [ fabsf(x1) < t ] */ \
87+
__ASM_EMIT("fcmgt v5.4s, v26.4s, v5.4s") \
88+
__ASM_EMIT("bif v0.16b, v9.16b, v2.16b") /* v0 = [ fabsf(x1) >= 1e-6 ] ? f : 1.0 */ \
89+
__ASM_EMIT("bif v1.16b, v9.16b, v3.16b") \
90+
__ASM_EMIT("and v0.16b, v0.16b, v4.16b") /* v0 = [ fabsf(x1) < t ] ? ([ fabsf(x1) >= 1e-6 ] ? f : 1.0) : 0.0 */ \
91+
__ASM_EMIT("and v1.16b, v1.16b, v5.16b")
92+
93+
#define LANCZOS_GEN_FUNC_X4 \
94+
/* v0 = x1 */ \
95+
__ASM_EMIT("fmul v1.4s, v0.4s, v27.4s") /* v1 = x2 = x1*a */ \
96+
__ASM_EMIT("fabs v2.4s, v0.4s") /* v2 = fabsf(x1) */ \
97+
__ASM_EMIT("fmul v3.4s, v0.4s, v1.4s") /* v3 = x1*x2 */ \
98+
__ASM_EMIT("frecpe v10.4s, v3.4s") /* v10 = s2 */ \
99+
__ASM_EMIT("frecps v12.4s, v10.4s, v3.4s") /* v12 = (2 - R*s2) */ \
100+
__ASM_EMIT("fmul v10.4s, v12.4s, v10.4s") /* v10 = s2' = s2 * (2 - R*s2) */ \
101+
__ASM_EMIT("frecps v12.4s, v10.4s, v3.4s") /* v12 = (2 - R*s2') */ \
102+
__ASM_EMIT("fmul v3.4s, v12.4s, v10.4s") /* v3 = n = 1/(x1*x2) */ \
103+
__ASM_EMIT("fadd v0.4s, v0.4s, v16.4s") /* v0 = x1 + PI/2 */ \
104+
__ASM_EMIT("fadd v1.4s, v1.4s, v16.4s") /* v1 = x2 + PI/2 */ \
105+
SINF_X_PLUS_PI_2_CORE_X8 \
106+
/* v0 = sinf(x1), v1 = sinf(x2) */ \
107+
__ASM_EMIT("ldp q8, q9, [%[LC]]") /* v8 = 1e-6, v9 = 1.0 */ \
108+
__ASM_EMIT("fmul v0.4s, v0.4s, v1.4s") /* v0 = sinf(x1)*sinf(x2) */ \
109+
__ASM_EMIT("fcmge v4.4s, v2.4s, v8.4s") /* v4 = [ fabsf(x1) >= 1e-6 ] */ \
110+
__ASM_EMIT("fmul v0.4s, v0.4s, v3.4s") /* v0 = sinf(x1)*sinf(x2)/(x1*x2) */ \
111+
__ASM_EMIT("fcmgt v5.4s, v26.4s, v2.4s") /* v5 = [ fabsf(x1) < t ] */ \
112+
__ASM_EMIT("bif v0.16b, v9.16b, v4.16b") /* v0 = [ fabsf(x1) >= 1e-6 ] ? f : 1.0 */ \
113+
__ASM_EMIT("and v0.16b, v0.16b, v5.16b") /* v0 = [ fabsf(x1) < t ] ? ([ fabsf(x1) >= 1e-6 ] ? f : 1.0) : 0.0 */
114+
115+
116+
void lanczos1(float *dst, float k, float p, float t, float a, size_t count)
117+
{
118+
IF_ARCH_AARCH64(
119+
lanczos_gen_t state;
120+
);
121+
ARCH_AARCH64_ASM(
122+
__ASM_EMIT("ldp q16, q17, [%[S2C], #0x00]") /* v16 = PI/2, v17 = PI */
123+
__ASM_EMIT("ldp q18, q19, [%[S2C], #0x20]") /* v18 = 1/(2*PI), v19 = C0 */
124+
__ASM_EMIT("ldp q20, q21, [%[S2C], #0x40]") /* v20 = C1, v21 = C2 */
125+
__ASM_EMIT("ldp q22, q23, [%[S2C], #0x60]") /* v22 = C3, v23 = C4 */
126+
__ASM_EMIT("ld1r {v24.4s}, [%[k]]") /* v24 = k */
127+
__ASM_EMIT("ld1r {v25.4s}, [%[p]]") /* v25 = p */
128+
__ASM_EMIT("ld1r {v26.4s}, [%[t]]") /* v26 = t */
129+
__ASM_EMIT("ld1r {v27.4s}, [%[a]]") /* v27 = a */
130+
__ASM_EMIT("ldp q28, q29, [%[S2KP], #0x00]") /* v28 = i0 = 0 1 2 3, v29 = i1 = 4 5 6 7 */
131+
__ASM_EMIT("ldp q30, q31, [%[S2KP], #0x20]") /* v30 = s0 = 8 8 8 8, v31 = s1 = 4 4 4 4 */
132+
// x8 blocks
133+
__ASM_EMIT("subs %[count], %[count], #8")
134+
__ASM_EMIT("b.lo 2f")
135+
__ASM_EMIT("1:")
136+
__ASM_EMIT("fmul v0.4s, v28.4s, v24.4s") // v0 = k*i0
137+
__ASM_EMIT("fmul v1.4s, v29.4s, v24.4s") // v1 = k*i1
138+
__ASM_EMIT("fadd v28.4s, v28.4s, v30.4s") // v28 = i0 + step
139+
__ASM_EMIT("fadd v29.4s, v29.4s, v30.4s") // v29 = i1 + step
140+
__ASM_EMIT("fsub v0.4s, v0.4s, v25.4s") // v0 = k*i0 - p
141+
__ASM_EMIT("fsub v1.4s, v1.4s, v25.4s") // v1 = k*i1 - p
142+
LANCZOS_GEN_FUNC_X8
143+
__ASM_EMIT("subs %[count], %[count], #8")
144+
__ASM_EMIT("stp q0, q1, [%[dst], #0x00]")
145+
__ASM_EMIT("add %[dst], %[dst], #0x20")
146+
__ASM_EMIT("b.hs 1b")
147+
__ASM_EMIT("2:")
148+
// x4 block
149+
__ASM_EMIT("adds %[count], %[count], #4")
150+
__ASM_EMIT("b.lt 4f")
151+
__ASM_EMIT("fmul v0.4s, v28.4s, v24.4s") // v0 = k*i0
152+
__ASM_EMIT("fadd v28.4s, v28.4s, v31.4s") // v28 = i0 + step
153+
__ASM_EMIT("fsub v0.4s, v0.4s, v25.4s") // v0 = k*i0 - p
154+
LANCZOS_GEN_FUNC_X4
155+
__ASM_EMIT("sub %[count], %[count], #4")
156+
__ASM_EMIT("str q0, [%[dst], #0x00]")
157+
__ASM_EMIT("add %[dst], %[dst], #0x10")
158+
__ASM_EMIT("4:")
159+
// Tail: 1x-3x block
160+
__ASM_EMIT("adds %[count], %[count], #4")
161+
__ASM_EMIT("b.ls 8f")
162+
__ASM_EMIT("fmul v0.4s, v28.4s, v24.4s") // v0 = k*i0
163+
__ASM_EMIT("fsub v0.4s, v0.4s, v25.4s") // v0 = k*i0 - p
164+
LANCZOS_GEN_FUNC_X4
165+
__ASM_EMIT("tst %[count], #1")
166+
__ASM_EMIT("b.eq 6f")
167+
__ASM_EMIT("st1 {v0.s}[0], [%[dst]]")
168+
__ASM_EMIT("add %[dst], %[dst], #0x04")
169+
__ASM_EMIT("ext v0.16b, v0.16b, v0.16b, #4") // v0 = S1 S2 S3 S0
170+
__ASM_EMIT("6:")
171+
__ASM_EMIT("tst %[count], #2")
172+
__ASM_EMIT("b.eq 8f")
173+
__ASM_EMIT("st1 {v0.d}[0], [%[dst]]")
174+
// End
175+
__ASM_EMIT("8:")
176+
177+
: [dst] "+r" (dst), [count] "+r" (count)
178+
: [k] "r" (&k),
179+
[p] "r" (&p),
180+
[t] "r" (&t),
181+
[a] "r" (&a),
182+
[S2C] "r" (&sinf_const[0]),
183+
[S2KP] "r" (&kp_gen_const[0]),
184+
[LC] "r" (&lanczos_const[0]),
185+
[state] "r" (&state)
186+
: "cc", "memory",
187+
"v0", "v1", "v2", "v3",
188+
"v4", "v5", "v6", "v7",
189+
"v8", "v9", "v10", "v11",
190+
"v12", "v13", "v14", "v15",
191+
"v16", "v17", "v18", "v19",
192+
"v20", "v21", "v22", "v23",
193+
"v24", "v25", "v26", "v27",
194+
"v28", "v29", "v30", "v31"
195+
);
196+
}
197+
198+
} /* namespace asimd */
199+
} /* namespace lsp */
200+
201+
202+
203+
204+
#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_PMATH_LANCZOS_H_ */

include/private/dsp/arch/aarch64/asimd/pmath/sin.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ namespace lsp
5555
#define SINF_X_PLUS_PI_2_CORE_X16 \
5656
/* v0 = X0 = x0 + PI/2 */ \
5757
/* v1 = X1 = x1 + PI/2 */ \
58-
/* v2 = X0 = x0 + PI/2 */ \
59-
/* v3 = X1 = x1 + PI/2 */ \
58+
/* v2 = X2 = x2 + PI/2 */ \
59+
/* v3 = X3 = x3 + PI/2 */ \
6060
/* v16 = PI/2 */ \
6161
/* v17 = PI */ \
6262
/* v18 = 1/(2*PI) */ \

src/main/aarch64/asimd.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@
299299
EXPORT1(sinf2);
300300
EXPORT1(sinf_kp1);
301301

302+
EXPORT1(lanczos1);
303+
302304
EXPORT1(sqr1);
303305
EXPORT1(sqr2);
304306
EXPORT1(ssqrt1);

src/test/ptest/pmath/lanczos1.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ namespace lsp
7777
IF_ARCH_AARCH64(
7878
namespace asimd
7979
{
80-
// void lanczos1(float *dst, float k, float p, float t, float a, size_t count);
80+
void lanczos1(float *dst, float k, float p, float t, float a, size_t count);
8181
}
8282
)
8383

@@ -133,7 +133,7 @@ PTEST_BEGIN("dsp.pmath", lanczos1, 5, 1000)
133133
IF_ARCH_X86(CALL(avx512::lanczos1));
134134
IF_ARCH_X86_64(CALL(avx512::x64_lanczos1));
135135
IF_ARCH_ARM(CALL(neon_d32::lanczos1));
136-
// IF_ARCH_AARCH64(CALL(asimd::lanczos1));
136+
IF_ARCH_AARCH64(CALL(asimd::lanczos1));
137137
PTEST_SEPARATOR;
138138
}
139139

src/test/utest/pmath/lanczos1.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ namespace lsp
7272
IF_ARCH_AARCH64(
7373
namespace asimd
7474
{
75-
// void lanczos1(float *dst, float k, float p, float t, float a, size_t count);
75+
void lanczos1(float *dst, float k, float p, float t, float a, size_t count);
7676
}
7777
)
7878

@@ -147,7 +147,7 @@ UTEST_BEGIN("dsp.pmath", lanczos1)
147147
IF_ARCH_X86(CALL(generic::lanczos1, avx512::lanczos1, 64));
148148
IF_ARCH_X86_64(CALL(generic::lanczos1, avx512::x64_lanczos1, 64));
149149
IF_ARCH_ARM(CALL(generic::lanczos1, neon_d32::lanczos1, 16));
150-
// IF_ARCH_AARCH64(CALL(generic::lanczos1, asimd::lanczos1, 16));
150+
IF_ARCH_AARCH64(CALL(generic::lanczos1, asimd::lanczos1, 16));
151151
}
152152
UTEST_END
153153

0 commit comments

Comments
 (0)