@@ -66,6 +66,55 @@ static inline int64_t shift_and_round( int64_t x, int shift )
6666 return ( x + ( static_cast <int64_t >(1 ) << ( shift - 1 ) ) ) >> shift;
6767}
6868
// Primary template for the NEON block copy from Pel samples to TCoeff values.
// Only declared here; the explicit specializations for W = 4 and W = 8 below
// provide the implementations.
//   src        first sample of the source block
//   srcStride  distance between consecutive source rows, in Pel elements
//   dst        destination buffer; the specializations advance it by `width`
//              elements per row, i.e. it is written without row padding
//   width      samples per row (checked against W by each specialization)
//   height     number of rows, must be >= 1
69+ template <int W>
70+ void cpyCoeff_neon ( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height );
71+
72+ template <>
73+ void cpyCoeff_neon<4 >( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height )
74+ {
75+ CHECK ( height < 1 , " Height must be >= 1" );
76+ CHECK ( width != 4 , " Width must be 4" );
77+
78+ do
79+ {
80+ int16x4_t s = vld1_s16 ( src );
81+ vst1q_s32 ( dst, vmovl_s16 ( s ) );
82+
83+ src += srcStride;
84+ dst += width;
85+ } while ( --height != 0 );
86+ }
87+
88+ template <>
89+ void cpyCoeff_neon<8 >( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height )
90+ {
91+ CHECK ( height < 1 , " Height must be >= 1" );
92+ CHECK ( width < 8 || width & 7 , " Width must be >= 8 and a multiple of 8" );
93+
94+ do
95+ {
96+ unsigned w = 0 ;
97+ for ( ; w + 16 <= width; w += 16 )
98+ {
99+ int16x8_t s_lo = vld1q_s16 ( src + w + 0 );
100+ int16x8_t s_hi = vld1q_s16 ( src + w + 8 );
101+ vst1q_s32 ( dst + w + 0 , vmovl_s16 ( vget_low_s16 ( s_lo ) ) );
102+ vst1q_s32 ( dst + w + 4 , vmovl_s16 ( vget_high_s16 ( s_lo ) ) );
103+ vst1q_s32 ( dst + w + 8 , vmovl_s16 ( vget_low_s16 ( s_hi ) ) );
104+ vst1q_s32 ( dst + w + 12 , vmovl_s16 ( vget_high_s16 ( s_hi ) ) );
105+ }
106+ if ( width & 8 )
107+ {
108+ int16x8_t s = vld1q_s16 ( src + w );
109+ vst1q_s32 ( dst + w + 0 , vmovl_s16 ( vget_low_s16 ( s ) ) );
110+ vst1q_s32 ( dst + w + 4 , vmovl_s16 ( vget_high_s16 ( s ) ) );
111+ }
112+
113+ src += srcStride;
114+ dst += width;
115+ } while ( --height != 0 );
116+ }
117+
69118template <unsigned trSize>
70119static void fastInvCore_neon ( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines,
71120 unsigned reducedLines, unsigned rows )
@@ -264,6 +313,9 @@ static void fastFwdCore_neon( const TMatrixCoeff* tc, const TCoeff* src, TCoeff*
264313template <>
265314void TCoeffOps::_initTCoeffOpsARM<NEON>()
266315{
316+ cpyCoeff4 = cpyCoeff_neon<4 >;
317+ cpyCoeff8 = cpyCoeff_neon<8 >;
318+
267319 fastInvCore[ 0 ] = fastInvCore_neon<4 >;
268320 fastInvCore[ 1 ] = fastInvCore_neon<8 >;
269321 fastInvCore[ 2 ] = fastInvCore_neon<16 >;
0 commit comments