|
| 1 | +/* |
| 2 | + * Copyright (C) 2025 Linux Studio Plugins Project <https://lsp-plug.in/> |
| 3 | + * (C) 2025 Vladimir Sadovnikov <[email protected]> |
| 4 | + * |
| 5 | + * This file is part of lsp-dsp-lib |
 * Created on: 22 Apr 2025
| 7 | + * |
| 8 | + * lsp-dsp-lib is free software: you can redistribute it and/or modify |
| 9 | + * it under the terms of the GNU Lesser General Public License as published by |
| 10 | + * the Free Software Foundation, either version 3 of the License, or |
| 11 | + * any later version. |
| 12 | + * |
| 13 | + * lsp-dsp-lib is distributed in the hope that it will be useful, |
| 14 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 16 | + * GNU Lesser General Public License for more details. |
| 17 | + * |
| 18 | + * You should have received a copy of the GNU Lesser General Public License |
| 19 | + * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>. |
| 20 | + */ |
| 21 | + |
| 22 | +#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_PMATH_LANCZOS_H_ |
| 23 | +#define PRIVATE_DSP_ARCH_AARCH64_ASIMD_PMATH_LANCZOS_H_ |
| 24 | + |
| 25 | +#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL |
| 26 | + #error "This header should not be included directly" |
| 27 | +#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL */ |
| 28 | + |
| 29 | +#include <private/dsp/arch/aarch64/asimd/pmath/sin.h> |
| 30 | + |
| 31 | +namespace lsp |
| 32 | +{ |
| 33 | + namespace asimd |
| 34 | + { |
        IF_ARCH_AARCH64(
            // Constants consumed by the LANCZOS_GEN_FUNC_X* macros below.
            // Raw IEEE-754 single-precision bit patterns, replicated 4x so each
            // entry fills one 128-bit NEON register; 16-byte aligned because the
            // asm loads both rows with a single q-form LDP from [%[LC]].
            // Layout (byte offsets) is relied upon by the assembly — do not reorder.
            static const uint32_t lanczos_const[] __lsp_aligned16 =
            {
                LSP_DSP_VEC4(0x358637bd),               // +0x00: Sinc threshold = 1e-6
                LSP_DSP_VEC4(0x3f800000),               // +0x10: 1.0
            };
        )
| 42 | + |
        // Scratch area used by LANCZOS_GEN_FUNC_X8 to spill intermediates across
        // the SINF_X_PLUS_PI_2_CORE_X16 evaluation, which clobbers the low vector
        // registers. Field offsets (+0x00, +0x20) are hard-coded in the assembly
        // (stp/ldp at [%[state], 0x00] and [%[state], 0x20]) — do not reorder.
        typedef struct lanczos_gen_t
        {
            float x1[8];                // +0x00: fabsf(x1) — absolute values of the arguments
            float n[8];                 // +0x20: Numerator n = 1/(x1*x2)
        } lanczos_gen_t;
| 48 | + |
        /*
         * Compute eight Lanczos kernel values in-place in v0:v1 from eight
         * arguments x1 in v0:v1:
         *      f(x1) = sinf(x1)*sinf(x2) / (x1*x2), where x2 = x1*a,
         * clamped to 1.0 where fabsf(x1) < 1e-6 (sinc singularity) and forced
         * to 0.0 where fabsf(x1) >= t (outside the window).
         *
         * Expects: v16 = PI/2, v26 = t, v27 = a, plus the sinf constants
         * loaded from sinf_const (see lanczos1 below); %[state] points to a
         * lanczos_gen_t scratch buffer, %[LC] to lanczos_const.
         * The reciprocal 1/(x1*x2) is refined from the FRECPE estimate with
         * two FRECPS Newton-Raphson iterations.
         */
        #define LANCZOS_GEN_FUNC_X8 \
            /* v0   = x1[0] */ \
            /* v1   = x1[1] */ \
            __ASM_EMIT("fmul            v2.4s, v0.4s, v27.4s")          /* v2   = x2 = x1*a */ \
            __ASM_EMIT("fmul            v3.4s, v1.4s, v27.4s") \
            __ASM_EMIT("fabs            v4.4s, v0.4s")                  /* v4   = fabsf(x1) */ \
            __ASM_EMIT("fabs            v5.4s, v1.4s") \
            __ASM_EMIT("fmul            v8.4s, v0.4s, v2.4s")           /* v8   = x1*x2 */ \
            __ASM_EMIT("fmul            v9.4s, v1.4s, v3.4s") \
            __ASM_EMIT("frecpe          v10.4s, v8.4s")                 /* v10  = s2 (reciprocal estimate) */ \
            __ASM_EMIT("frecpe          v11.4s, v9.4s") \
            __ASM_EMIT("frecps          v12.4s, v10.4s, v8.4s")         /* v12  = (2 - R*s2) */ \
            __ASM_EMIT("frecps          v13.4s, v11.4s, v9.4s") \
            __ASM_EMIT("fmul            v10.4s, v12.4s, v10.4s")        /* v10  = s2' = s2 * (2 - R*s2) */ \
            __ASM_EMIT("fmul            v11.4s, v13.4s, v11.4s") \
            __ASM_EMIT("frecps          v12.4s, v10.4s, v8.4s")         /* v12  = (2 - R*s2') */ \
            __ASM_EMIT("frecps          v13.4s, v11.4s, v9.4s") \
            __ASM_EMIT("fmul            v8.4s, v12.4s, v10.4s")         /* v8   = n = 1/(x1*x2) */ \
            __ASM_EMIT("fmul            v9.4s, v13.4s, v11.4s") \
            __ASM_EMIT("fadd            v0.4s, v0.4s, v16.4s")          /* v0   = x1 + PI/2 */ \
            __ASM_EMIT("fadd            v1.4s, v1.4s, v16.4s") \
            __ASM_EMIT("fadd            v2.4s, v2.4s, v16.4s")          /* v2   = x2 + PI/2 */ \
            __ASM_EMIT("fadd            v3.4s, v3.4s, v16.4s") \
            /* Spill |x1| and n: the sine core clobbers v4..v13 */ \
            __ASM_EMIT("stp             q4, q5, [%[state], 0x00]")      /* save fabfs(x1) */ \
            __ASM_EMIT("stp             q8, q9, [%[state], 0x20]")      /* save n */ \
            SINF_X_PLUS_PI_2_CORE_X16 \
            /* v0   = sinf(x1[0]), v1 = sinf(x1[1]) */ \
            /* v2   = sinf(x2[0]), v3 = sinf(x2[1]) */ \
            __ASM_EMIT("ldp             q4, q5, [%[state], 0x00]")      /* v4   = fabsf(x1) */ \
            __ASM_EMIT("ldp             q6, q7, [%[state], 0x20]")      /* v6   = n = 1/(x1*x2) */ \
            __ASM_EMIT("ldp             q8, q9, [%[LC]]")               /* v8   = 1e-6, v9 = 1.0 */ \
            __ASM_EMIT("fmul            v0.4s, v0.4s, v2.4s")           /* v0   = sinf(x1)*sinf(x2) */ \
            __ASM_EMIT("fmul            v1.4s, v1.4s, v3.4s") \
            __ASM_EMIT("fcmge           v2.4s, v4.4s, v8.4s")           /* v2   = [ fabsf(x1) >= 1e-6 ] */ \
            __ASM_EMIT("fcmge           v3.4s, v5.4s, v8.4s") \
            __ASM_EMIT("fmul            v0.4s, v0.4s, v6.4s")           /* v0   = sinf(x1)*sinf(x2)/(x1*x2) */ \
            __ASM_EMIT("fmul            v1.4s, v1.4s, v7.4s") \
            __ASM_EMIT("fcmgt           v4.4s, v26.4s, v4.4s")          /* v4   = [ fabsf(x1) < t ] */ \
            __ASM_EMIT("fcmgt           v5.4s, v26.4s, v5.4s") \
            __ASM_EMIT("bif             v0.16b, v9.16b, v2.16b")        /* v0   = [ fabsf(x1) >= 1e-6 ] ? f : 1.0 */ \
            __ASM_EMIT("bif             v1.16b, v9.16b, v3.16b") \
            __ASM_EMIT("and             v0.16b, v0.16b, v4.16b")        /* v0   = [ fabsf(x1) < t ] ? ([ fabsf(x1) >= 1e-6 ] ? f : 1.0) : 0.0 */ \
            __ASM_EMIT("and             v1.16b, v1.16b, v5.16b")
| 92 | + |
        /*
         * Compute four Lanczos kernel values in-place in v0 from four
         * arguments x1 in v0 — four-lane variant of LANCZOS_GEN_FUNC_X8:
         *      f(x1) = sinf(x1)*sinf(x2) / (x1*x2), where x2 = x1*a,
         * clamped to 1.0 where fabsf(x1) < 1e-6 and 0.0 where fabsf(x1) >= t.
         * Expects v16 = PI/2, v26 = t, v27 = a, %[LC] = lanczos_const;
         * no scratch spill needed — intermediates fit outside the registers
         * clobbered by SINF_X_PLUS_PI_2_CORE_X8.
         */
        #define LANCZOS_GEN_FUNC_X4 \
            /* v0   = x1 */ \
            __ASM_EMIT("fmul            v1.4s, v0.4s, v27.4s")          /* v1   = x2 = x1*a */ \
            __ASM_EMIT("fabs            v2.4s, v0.4s")                  /* v2   = fabsf(x1) */ \
            __ASM_EMIT("fmul            v3.4s, v0.4s, v1.4s")           /* v3   = x1*x2 */ \
            __ASM_EMIT("frecpe          v10.4s, v3.4s")                 /* v10  = s2 (reciprocal estimate) */ \
            __ASM_EMIT("frecps          v12.4s, v10.4s, v3.4s")         /* v12  = (2 - R*s2) */ \
            __ASM_EMIT("fmul            v10.4s, v12.4s, v10.4s")        /* v10  = s2' = s2 * (2 - R*s2) */ \
            __ASM_EMIT("frecps          v12.4s, v10.4s, v3.4s")         /* v12  = (2 - R*s2') */ \
            __ASM_EMIT("fmul            v3.4s, v12.4s, v10.4s")         /* v3   = n = 1/(x1*x2) */ \
            __ASM_EMIT("fadd            v0.4s, v0.4s, v16.4s")          /* v0   = x1 + PI/2 */ \
            __ASM_EMIT("fadd            v1.4s, v1.4s, v16.4s")          /* v1   = x2 + PI/2 */ \
            SINF_X_PLUS_PI_2_CORE_X8 \
            /* v0   = sinf(x1), v1 = sinf(x2) */ \
            __ASM_EMIT("ldp             q8, q9, [%[LC]]")               /* v8   = 1e-6, v9 = 1.0 */ \
            __ASM_EMIT("fmul            v0.4s, v0.4s, v1.4s")           /* v0   = sinf(x1)*sinf(x2) */ \
            __ASM_EMIT("fcmge           v4.4s, v2.4s, v8.4s")           /* v4   = [ fabsf(x1) >= 1e-6 ] */ \
            __ASM_EMIT("fmul            v0.4s, v0.4s, v3.4s")           /* v0   = sinf(x1)*sinf(x2)/(x1*x2) */ \
            __ASM_EMIT("fcmgt           v5.4s, v26.4s, v2.4s")          /* v5   = [ fabsf(x1) < t ] */ \
            __ASM_EMIT("bif             v0.16b, v9.16b, v4.16b")        /* v0   = [ fabsf(x1) >= 1e-6 ] ? f : 1.0 */ \
            __ASM_EMIT("and             v0.16b, v0.16b, v5.16b")        /* v0   = [ fabsf(x1) < t ] ? ([ fabsf(x1) >= 1e-6 ] ? f : 1.0) : 0.0 */
| 114 | + |
| 115 | + |
        /*
         * Generate a sampled Lanczos kernel:
         *      dst[i] = f(k*i - p), i = 0..count-1,
         * where f(x) = sinf(x)*sinf(x*a) / (x * x*a), with f(x) = 1.0 for
         * |x| < 1e-6 (the sinc singularity) and f(x) = 0.0 for |x| >= t
         * (outside the window).
         *
         * @param dst   destination buffer, receives count floats
         * @param k     step multiplier applied to the sample index i
         * @param p     offset subtracted from k*i to form the argument
         * @param t     window threshold: output is zeroed where |k*i - p| >= t
         * @param a     argument scale of the second sinc factor
         * @param count number of samples to generate
         *
         * Processes samples in blocks of 8, then 4, then a 1-3 sample tail.
         * NOTE(review): sinf_const and kp_gen_const (index/step tables) are
         * presumably provided by the included sin.h header — not visible here.
         */
        void lanczos1(float *dst, float k, float p, float t, float a, size_t count)
        {
            IF_ARCH_AARCH64(
                lanczos_gen_t state;    // scratch spill area for LANCZOS_GEN_FUNC_X8
            );
            ARCH_AARCH64_ASM(
                /* Load sine-approximation constants and broadcast scalars */
                __ASM_EMIT("ldp             q16, q17, [%[S2C], #0x00]")     /* v16  = PI/2, v17 = PI */
                __ASM_EMIT("ldp             q18, q19, [%[S2C], #0x20]")     /* v18  = 1/(2*PI), v19 = C0 */
                __ASM_EMIT("ldp             q20, q21, [%[S2C], #0x40]")     /* v20  = C1, v21 = C2 */
                __ASM_EMIT("ldp             q22, q23, [%[S2C], #0x60]")     /* v22  = C3, v23 = C4 */
                __ASM_EMIT("ld1r            {v24.4s}, [%[k]]")              /* v24  = k */
                __ASM_EMIT("ld1r            {v25.4s}, [%[p]]")              /* v25  = p */
                __ASM_EMIT("ld1r            {v26.4s}, [%[t]]")              /* v26  = t */
                __ASM_EMIT("ld1r            {v27.4s}, [%[a]]")              /* v27  = a */
                __ASM_EMIT("ldp             q28, q29, [%[S2KP], #0x00]")    /* v28  = i0 = 0 1 2 3, v29 = i1 = 4 5 6 7 */
                __ASM_EMIT("ldp             q30, q31, [%[S2KP], #0x20]")    /* v30  = s0 = 8 8 8 8, v31 = s1 = 4 4 4 4 */
                // x8 blocks
                __ASM_EMIT("subs            %[count], %[count], #8")
                __ASM_EMIT("b.lo            2f")
                __ASM_EMIT("1:")
                __ASM_EMIT("fmul            v0.4s, v28.4s, v24.4s")         // v0   = k*i0
                __ASM_EMIT("fmul            v1.4s, v29.4s, v24.4s")         // v1   = k*i1
                __ASM_EMIT("fadd            v28.4s, v28.4s, v30.4s")        // v28  = i0 + step
                __ASM_EMIT("fadd            v29.4s, v29.4s, v30.4s")        // v29  = i1 + step
                __ASM_EMIT("fsub            v0.4s, v0.4s, v25.4s")          // v0   = k*i0 - p
                __ASM_EMIT("fsub            v1.4s, v1.4s, v25.4s")          // v1   = k*i1 - p
                LANCZOS_GEN_FUNC_X8
                __ASM_EMIT("subs            %[count], %[count], #8")
                __ASM_EMIT("stp             q0, q1, [%[dst], #0x00]")
                __ASM_EMIT("add             %[dst], %[dst], #0x20")
                __ASM_EMIT("b.hs            1b")
                __ASM_EMIT("2:")
                // x4 block
                __ASM_EMIT("adds            %[count], %[count], #4")
                __ASM_EMIT("b.lt            4f")
                __ASM_EMIT("fmul            v0.4s, v28.4s, v24.4s")         // v0   = k*i0
                __ASM_EMIT("fadd            v28.4s, v28.4s, v31.4s")        // v28  = i0 + step (advance by 4 only)
                __ASM_EMIT("fsub            v0.4s, v0.4s, v25.4s")          // v0   = k*i0 - p
                LANCZOS_GEN_FUNC_X4
                __ASM_EMIT("sub             %[count], %[count], #4")
                __ASM_EMIT("str             q0, [%[dst], #0x00]")
                __ASM_EMIT("add             %[dst], %[dst], #0x10")
                __ASM_EMIT("4:")
                // Tail: 1x-3x block — compute 4 values, store only count of them
                __ASM_EMIT("adds            %[count], %[count], #4")
                __ASM_EMIT("b.ls            8f")
                __ASM_EMIT("fmul            v0.4s, v28.4s, v24.4s")         // v0   = k*i0
                __ASM_EMIT("fsub            v0.4s, v0.4s, v25.4s")          // v0   = k*i0 - p
                LANCZOS_GEN_FUNC_X4
                __ASM_EMIT("tst             %[count], #1")
                __ASM_EMIT("b.eq            6f")
                __ASM_EMIT("st1             {v0.s}[0], [%[dst]]")           // store one sample
                __ASM_EMIT("add             %[dst], %[dst], #0x04")
                __ASM_EMIT("ext             v0.16b, v0.16b, v0.16b, #4")    // v0   = S1 S2 S3 S0 (rotate consumed lane out)
                __ASM_EMIT("6:")
                __ASM_EMIT("tst             %[count], #2")
                __ASM_EMIT("b.eq            8f")
                __ASM_EMIT("st1             {v0.d}[0], [%[dst]]")           // store two samples
                // End
                __ASM_EMIT("8:")

                : [dst] "+r" (dst), [count] "+r" (count)
                : [k] "r" (&k),
                  [p] "r" (&p),
                  [t] "r" (&t),
                  [a] "r" (&a),
                  [S2C] "r" (&sinf_const[0]),
                  [S2KP] "r" (&kp_gen_const[0]),
                  [LC] "r" (&lanczos_const[0]),
                  [state] "r" (&state)
                : "cc", "memory",
                  "v0", "v1", "v2", "v3",
                  "v4", "v5", "v6", "v7",
                  "v8", "v9", "v10", "v11",
                  "v12", "v13", "v14", "v15",
                  "v16", "v17", "v18", "v19",
                  "v20", "v21", "v22", "v23",
                  "v24", "v25", "v26", "v27",
                  "v28", "v29", "v30", "v31"
            );
        }
| 197 | + |
| 198 | + } /* namespace asimd */ |
| 199 | +} /* namespace lsp */ |
| 200 | + |
| 201 | + |
| 202 | + |
| 203 | + |
| 204 | +#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_PMATH_LANCZOS_H_ */ |
0 commit comments