@@ -60,6 +60,7 @@ POSSIBILITY OF SUCH DAMAGE.
6060#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCTF
6161
6262#include " MCTF_neon.h"
63+ #include " mem_neon.h"
6364#include " neon_sve_bridge.h"
6465#include < arm_neon.h>
6566#include < arm_sve.h>
@@ -118,6 +119,149 @@ int motionErrorLumaInt_sve( const Pel* org, const ptrdiff_t origStride, const Pe
118119 return error;
119120}
120121
122+ template <FilterCoeffType4 xType, FilterCoeffType4 yType>
123+ static inline int motionErrorLumaFrac_loRes2D_sve ( const Pel* org, const ptrdiff_t origStride, const Pel* buf,
124+ const ptrdiff_t buffStride, int w, int h, const int16_t * xFilter,
125+ const int16_t * yFilter, const int bitDepth, const int besterror )
126+ {
127+ const Pel maxSampleValue = ( 1 << bitDepth ) - 1 ;
128+
129+ CHECKD ( w % 8 != 0 , " Width must be multiple of 8!" );
130+ CHECKD ( h % 4 != 0 , " Height must be multiple of 4!" );
131+
132+ const int16x4_t xf = vrshr_n_s16 ( vld1_s16 ( xFilter ), 1 );
133+ const int16x4_t yf = vrshr_n_s16 ( vld1_s16 ( yFilter ), 1 );
134+
135+ constexpr int numFilterTaps = 4 ;
136+ int16x8_t h_src[numFilterTaps];
137+ int16x8_t v_src[numFilterTaps + 3 ]; // 3 extra elements are needed because the height loop is unrolled 4 times.
138+
139+ int error = 0 ;
140+
141+ do
142+ {
143+ load_s16_16x8x4 ( buf - 1 * buffStride - 1 , 1 , h_src );
144+ v_src[0 ] = motionErrorLumaFrac_loRes1D_neon<xType>( h_src, xf, maxSampleValue );
145+
146+ load_s16_16x8x4 ( buf + 0 * buffStride - 1 , 1 , h_src );
147+ v_src[1 ] = motionErrorLumaFrac_loRes1D_neon<xType>( h_src, xf, maxSampleValue );
148+
149+ load_s16_16x8x4 ( buf + 1 * buffStride - 1 , 1 , h_src );
150+ v_src[2 ] = motionErrorLumaFrac_loRes1D_neon<xType>( h_src, xf, maxSampleValue );
151+
152+ const Pel* rowStart = buf + 2 * buffStride - 1 ;
153+ const Pel* origRow = org;
154+
155+ int64x2_t diffSq0 = vdupq_n_s64 ( 0 );
156+ int64x2_t diffSq1 = vdupq_n_s64 ( 0 );
157+
158+ int y = h;
159+ do
160+ {
161+ load_s16_16x8x4 ( rowStart + 0 * buffStride, 1 , h_src );
162+ v_src[3 ] = motionErrorLumaFrac_loRes1D_neon<xType>( h_src, xf, maxSampleValue );
163+
164+ load_s16_16x8x4 ( rowStart + 1 * buffStride, 1 , h_src );
165+ v_src[4 ] = motionErrorLumaFrac_loRes1D_neon<xType>( h_src, xf, maxSampleValue );
166+
167+ load_s16_16x8x4 ( rowStart + 2 * buffStride, 1 , h_src );
168+ v_src[5 ] = motionErrorLumaFrac_loRes1D_neon<xType>( h_src, xf, maxSampleValue );
169+
170+ load_s16_16x8x4 ( rowStart + 3 * buffStride, 1 , h_src );
171+ v_src[6 ] = motionErrorLumaFrac_loRes1D_neon<xType>( h_src, xf, maxSampleValue );
172+
173+ int16x8_t ysum0 = motionErrorLumaFrac_loRes1D_neon<yType>( &v_src[0 ], yf, maxSampleValue );
174+ int16x8_t ysum1 = motionErrorLumaFrac_loRes1D_neon<yType>( &v_src[1 ], yf, maxSampleValue );
175+ int16x8_t ysum2 = motionErrorLumaFrac_loRes1D_neon<yType>( &v_src[2 ], yf, maxSampleValue );
176+ int16x8_t ysum3 = motionErrorLumaFrac_loRes1D_neon<yType>( &v_src[3 ], yf, maxSampleValue );
177+
178+ int16x8_t orig0 = vld1q_s16 ( origRow + 0 * origStride );
179+ int16x8_t orig1 = vld1q_s16 ( origRow + 1 * origStride );
180+ int16x8_t orig2 = vld1q_s16 ( origRow + 2 * origStride );
181+ int16x8_t orig3 = vld1q_s16 ( origRow + 3 * origStride );
182+
183+ int16x8_t diff0 = vabdq_s16 ( ysum0, orig0 );
184+ int16x8_t diff1 = vabdq_s16 ( ysum1, orig1 );
185+ int16x8_t diff2 = vabdq_s16 ( ysum2, orig2 );
186+ int16x8_t diff3 = vabdq_s16 ( ysum3, orig3 );
187+
188+ diffSq0 = vvenc_sdotq_s16 ( diffSq0, diff0, diff0 );
189+ diffSq0 = vvenc_sdotq_s16 ( diffSq0, diff1, diff1 );
190+ diffSq1 = vvenc_sdotq_s16 ( diffSq1, diff2, diff2 );
191+ diffSq1 = vvenc_sdotq_s16 ( diffSq1, diff3, diff3 );
192+
193+ v_src[0 ] = v_src[4 ];
194+ v_src[1 ] = v_src[5 ];
195+ v_src[2 ] = v_src[6 ];
196+
197+ rowStart += 4 * buffStride;
198+ origRow += 4 * origStride;
199+ y -= 4 ;
200+ } while ( y != 0 );
201+
202+ int64x2_t diffSq = vaddq_s64 ( diffSq0, diffSq1 );
203+ error += ( int32_t )vaddvq_s64 ( diffSq );
204+ if ( error > besterror )
205+ {
206+ return error;
207+ }
208+
209+ buf += 8 ;
210+ org += 8 ;
211+ w -= 8 ;
212+ } while ( w != 0 );
213+
214+ return error;
215+ }
216+
217+ template <FilterCoeffType4 xType>
218+ static inline auto get_motionErrorLumaFrac2D ( FilterCoeffType4 type )
219+ {
220+ switch ( type )
221+ {
222+ case FilterCoeffType4::SkewLeft:
223+ return &motionErrorLumaFrac_loRes2D_sve<xType, FilterCoeffType4::SkewLeft>;
224+ case FilterCoeffType4::SkewRight:
225+ return &motionErrorLumaFrac_loRes2D_sve<xType, FilterCoeffType4::SkewRight>;
226+ case FilterCoeffType4::FullSymmetric:
227+ return &motionErrorLumaFrac_loRes2D_sve<xType, FilterCoeffType4::FullSymmetric>;
228+ case FilterCoeffType4::Generic:
229+ default :
230+ return &motionErrorLumaFrac_loRes2D_sve<xType, FilterCoeffType4::Generic>;
231+ }
232+ }
233+
234+ int motionErrorLumaFrac_loRes_sve ( const Pel* org, const ptrdiff_t origStride, const Pel* buf,
235+ const ptrdiff_t buffStride, const int w, const int h, const int16_t * xFilter,
236+ const int16_t * yFilter, const int bitDepth, const int besterror )
237+ {
238+ const FilterCoeffType4 xType = selectFilterType4 ( xFilter );
239+ const FilterCoeffType4 yType = selectFilterType4 ( yFilter );
240+
241+ using motionErrorLumaFrac_loResFunc = int ( * )( const Pel*, const ptrdiff_t , const Pel*, const ptrdiff_t , const int ,
242+ const int , const int16_t *, const int16_t *, const int , const int );
243+ motionErrorLumaFrac_loResFunc func;
244+
245+ switch ( xType )
246+ {
247+ case FilterCoeffType4::SkewLeft:
248+ func = get_motionErrorLumaFrac2D<FilterCoeffType4::SkewLeft>( yType );
249+ break ;
250+ case FilterCoeffType4::SkewRight:
251+ func = get_motionErrorLumaFrac2D<FilterCoeffType4::SkewRight>( yType );
252+ break ;
253+ case FilterCoeffType4::FullSymmetric:
254+ func = get_motionErrorLumaFrac2D<FilterCoeffType4::FullSymmetric>( yType );
255+ break ;
256+ case FilterCoeffType4::Generic:
257+ default :
258+ func = get_motionErrorLumaFrac2D<FilterCoeffType4::Generic>( yType );
259+ break ;
260+ }
261+
262+ return func ( org, origStride, buf, buffStride, w, h, xFilter, yFilter, bitDepth, besterror );
263+ }
264+
121265void applyPlanarCorrection_sve ( const Pel* refPel, const ptrdiff_t refStride, Pel* dstPel, const ptrdiff_t dstStride,
122266 const int32_t w, const int32_t h, const ClpRng& clpRng, const uint16_t motionError )
123267{
@@ -406,6 +550,7 @@ template<>
406550void MCTF::_initMCTF_ARM<SVE>()
407551{
408552 m_motionErrorLumaInt8 = motionErrorLumaInt_sve;
553+ m_motionErrorLumaFrac8[1 ] = motionErrorLumaFrac_loRes_sve;
409554 m_applyPlanarCorrection = applyPlanarCorrection_sve;
410555 m_applyBlock = applyBlock_sve;
411556}
0 commit comments