Skip to content

Commit 5c2b2cb

Browse files
authored
Merge pull request fraunhoferhhi#576 from micro-arm/cpycoeff-neon
Arm: Add Neon implementations of TCoeffOps::cpyCoeff kernels
2 parents 5b6d4e9 + ba174b3 commit 5c2b2cb

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed

source/Lib/CommonLib/arm/neon/Trafo_neon.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,55 @@ static inline int64_t shift_and_round( int64_t x, int shift )
6666
return ( x + ( static_cast<int64_t>(1) << ( shift - 1 ) ) ) >> shift;
6767
}
6868

69+
// Primary template for the Neon coefficient-copy kernels, selected by the width class W.
// Only the declaration is given here; the implementations are supplied as explicit
// specializations (W = 4 and W = 8 are the ones visible in this file section).
template<int W>
70+
void cpyCoeff_neon( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height );
71+
72+
template<>
73+
void cpyCoeff_neon<4>( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height )
74+
{
75+
CHECK( height < 1, "Height must be >= 1" );
76+
CHECK( width != 4, "Width must be 4" );
77+
78+
do
79+
{
80+
int16x4_t s = vld1_s16( src );
81+
vst1q_s32( dst, vmovl_s16( s ) );
82+
83+
src += srcStride;
84+
dst += width;
85+
} while( --height != 0 );
86+
}
87+
88+
template<>
89+
void cpyCoeff_neon<8>( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height )
90+
{
91+
CHECK( height < 1, "Height must be >= 1" );
92+
CHECK( width < 8 || width & 7, "Width must be >= 8 and a multiple of 8" );
93+
94+
do
95+
{
96+
unsigned w = 0;
97+
for( ; w + 16 <= width; w += 16 )
98+
{
99+
int16x8_t s_lo = vld1q_s16( src + w + 0 );
100+
int16x8_t s_hi = vld1q_s16( src + w + 8 );
101+
vst1q_s32( dst + w + 0, vmovl_s16( vget_low_s16( s_lo ) ) );
102+
vst1q_s32( dst + w + 4, vmovl_s16( vget_high_s16( s_lo ) ) );
103+
vst1q_s32( dst + w + 8, vmovl_s16( vget_low_s16( s_hi ) ) );
104+
vst1q_s32( dst + w + 12, vmovl_s16( vget_high_s16( s_hi ) ) );
105+
}
106+
if( width & 8 )
107+
{
108+
int16x8_t s = vld1q_s16( src + w );
109+
vst1q_s32( dst + w + 0, vmovl_s16( vget_low_s16( s ) ) );
110+
vst1q_s32( dst + w + 4, vmovl_s16( vget_high_s16( s ) ) );
111+
}
112+
113+
src += srcStride;
114+
dst += width;
115+
} while( --height != 0 );
116+
}
117+
69118
template<unsigned trSize>
70119
static void fastInvCore_neon( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines,
71120
unsigned reducedLines, unsigned rows )
@@ -264,6 +313,9 @@ static void fastFwdCore_neon( const TMatrixCoeff* tc, const TCoeff* src, TCoeff*
264313
template<>
265314
void TCoeffOps::_initTCoeffOpsARM<NEON>()
266315
{
316+
cpyCoeff4 = cpyCoeff_neon<4>;
317+
cpyCoeff8 = cpyCoeff_neon<8>;
318+
267319
fastInvCore[ 0 ] = fastInvCore_neon<4>;
268320
fastInvCore[ 1 ] = fastInvCore_neon<8>;
269321
fastInvCore[ 2 ] = fastInvCore_neon<16>;

0 commit comments

Comments
 (0)