Skip to content

Commit c21c0b6

Browse files
committed
Performance improvement: replaced vgatherdps instructions with multiple scalar loads
1 parent 123beff commit c21c0b6

File tree

4 files changed: +118 -251 lines changed

include/private/dsp/arch/x86/avx512/fft.h

+6-9
Original file line number | Diff line number | Diff line change
@@ -29,22 +29,19 @@
2929
#include <private/dsp/arch/x86/avx512/fft/const.h>
3030
#include <private/dsp/arch/x86/avx512/fft/normalize.h>
3131
#include <private/dsp/arch/x86/avx512/fft/butterfly.h>
32-
#include <private/dsp/arch/x86/avx512/fft/rbits.h>
3332

3433
#define FFT_SCRAMBLE_SELF_DIRECT_NAME scramble_self_direct8
3534
#define FFT_SCRAMBLE_SELF_REVERSE_NAME scramble_self_reverse8
3635
#define FFT_SCRAMBLE_COPY_DIRECT_NAME scramble_copy_direct8
3736
#define FFT_SCRAMBLE_COPY_REVERSE_NAME scramble_copy_reverse8
3837
#define FFT_TYPE uint8_t
39-
#define FFT_REVERSE_BITS FFT_REVERSE_BITS8
4038
#include <private/dsp/arch/x86/avx512/fft/scramble.h>
4139

4240
#define FFT_SCRAMBLE_SELF_DIRECT_NAME scramble_self_direct16
4341
#define FFT_SCRAMBLE_SELF_REVERSE_NAME scramble_self_reverse16
4442
#define FFT_SCRAMBLE_COPY_DIRECT_NAME scramble_copy_direct16
4543
#define FFT_SCRAMBLE_COPY_REVERSE_NAME scramble_copy_reverse16
4644
#define FFT_TYPE uint16_t
47-
#define FFT_REVERSE_BITS FFT_REVERSE_BITS16
4845
#include <private/dsp/arch/x86/avx512/fft/scramble.h>
4946

5047
namespace lsp
@@ -156,10 +153,10 @@ namespace lsp
156153
}
157154
else
158155
{
159-
if (rank <= 8)
160-
scramble_copy_direct8(dst_re, dst_im, src_re, src_im, rank);
156+
if (rank <= 13)
157+
scramble_copy_direct8(dst_re, dst_im, src_re, src_im, rank - 5);
161158
else
162-
scramble_copy_direct16(dst_re, dst_im, src_re, src_im, rank);
159+
scramble_copy_direct16(dst_re, dst_im, src_re, src_im, rank - 5);
163160
}
164161

165162
for (size_t i=4; i < rank; ++i)
@@ -186,10 +183,10 @@ namespace lsp
186183
}
187184
else
188185
{
189-
if (rank <= 8)
190-
scramble_copy_reverse8(dst_re, dst_im, src_re, src_im, rank);
186+
if (rank <= 13)
187+
scramble_copy_reverse8(dst_re, dst_im, src_re, src_im, rank - 5);
191188
else
192-
scramble_copy_reverse16(dst_re, dst_im, src_re, src_im, rank);
189+
scramble_copy_reverse16(dst_re, dst_im, src_re, src_im, rank - 5);
193190
}
194191

195192
for (size_t i=4; i < rank; ++i)

include/private/dsp/arch/x86/avx512/fft/const.h

+3-10
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ namespace lsp
103103
LSP_DSP_VEC16(0.9999999954041073), LSP_DSP_VEC16(0.0000958737990960), // rank = 18
104104
};
105105

106-
static const uint32_t FFT_SCRAMBLE_DIRECT_INDICES[] __lsp_aligned64 =
106+
static const uint32_t FFT_SCRAMBLE_INDICES[] __lsp_aligned64 =
107107
{
108108
// butterfly 1
109109
0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, // + 0x000
@@ -119,15 +119,8 @@ namespace lsp
119119
// butterfly 4
120120
0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27, // + 0x200
121121
4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31, // + 0x240
122-
};
123-
124-
static const uint32_t FFT_RBITS[] __lsp_aligned64 =
125-
{
126-
0xff00ff00, // step2
127-
0xf0f0f0f0, // step3
128-
0xcccccccc, // step4
129-
0xaaaaaaaa, // step5
130-
32, // increment
122+
// permute 16x
123+
0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, // + 0x280
131124
};
132125

133126
} /* namespace avx512 */

include/private/dsp/arch/x86/avx512/fft/rbits.h

-104
This file was deleted.

0 commit comments

Comments
 (0)