Skip to content

Commit 629d4a0

Browse files
authored
Merge pull request #621 from athulya-arm/athulya-arm/motionErrorLumaInt
Arm: Add SVE implementation of motionErrorLumaInt
2 parents 470943a + 540f495 commit 629d4a0

File tree

2 files changed

+70
-9
lines changed

2 files changed

+70
-9
lines changed

source/Lib/CommonLib/arm/neon/MCTF_neon.cpp

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,8 @@ int motionErrorLumaFrac_loRes_neon( const Pel* org, const ptrdiff_t origStride,
312312
static int motionErrorLumaInt_neon( const Pel* org, const ptrdiff_t origStride, const Pel* buf,
313313
const ptrdiff_t buffStride, const int w, int h, const int besterror )
314314
{
315-
CHECK( w % 8 != 0, "Width must be a multiple of eight" );
315+
CHECKD( w % 8 != 0, "Width must be a multiple of eight" );
316+
CHECKD( h % 2 != 0, "Height must be a multiple of two" );
316317

317318
int error = 0;
318319
do
@@ -323,12 +324,18 @@ static int motionErrorLumaInt_neon( const Pel* org, const ptrdiff_t origStride,
323324
int x1 = 0;
324325
do
325326
{
326-
int16x8_t o = vld1q_s16( org + x1 );
327-
int16x8_t b = vld1q_s16( buf + x1 );
327+
int16x8_t o1 = vld1q_s16( org + x1 );
328+
int16x8_t b1 = vld1q_s16( buf + x1 );
329+
int16x8_t o2 = vld1q_s16( org + origStride + x1 );
330+
int16x8_t b2 = vld1q_s16( buf + buffStride + x1 );
328331

329-
int16x8_t diff = vabdq_s16( o, b );
330-
acc_lo = vmlal_s16( acc_lo, vget_low_s16( diff ), vget_low_s16( diff ) );
331-
acc_hi = vmlal_s16( acc_hi, vget_high_s16( diff ), vget_high_s16( diff ) );
332+
int16x8_t diff1 = vabdq_s16( o1, b1 );
333+
int16x8_t diff2 = vabdq_s16( o2, b2 );
334+
acc_lo = vmlal_s16( acc_lo, vget_low_s16( diff1 ), vget_low_s16( diff1 ) );
335+
acc_hi = vmlal_s16( acc_hi, vget_high_s16( diff1 ), vget_high_s16( diff1 ) );
336+
337+
acc_lo = vmlal_s16( acc_lo, vget_low_s16( diff2 ), vget_low_s16( diff2 ) );
338+
acc_hi = vmlal_s16( acc_hi, vget_high_s16( diff2 ), vget_high_s16( diff2 ) );
332339

333340
x1 += 8;
334341
} while( x1 != w );
@@ -340,9 +347,10 @@ static int motionErrorLumaInt_neon( const Pel* org, const ptrdiff_t origStride,
340347
return error;
341348
}
342349

343-
org += origStride;
344-
buf += buffStride;
345-
} while( --h != 0 );
350+
org += 2 * origStride;
351+
buf += 2 * buffStride;
352+
h -= 2;
353+
} while( h != 0 );
346354

347355
return error;
348356
}

source/Lib/CommonLib/arm/sve/MCTF_sve.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,58 @@ POSSIBILITY OF SUCH DAMAGE.
6666

6767
namespace vvenc
6868
{
69+
int motionErrorLumaInt_sve( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride,
70+
const int w, int h, const int besterror )
71+
{
72+
CHECKD( w % 8 != 0, "Width must be a multiple of eight" );
73+
CHECKD( h % 4 != 0, "Height must be a multiple of four" );
74+
75+
int error = 0;
76+
do
77+
{
78+
int64x2_t acc1 = vdupq_n_s64( 0 );
79+
int64x2_t acc2 = vdupq_n_s64( 0 );
80+
81+
int x1 = 0;
82+
do
83+
{
84+
int16x8_t o1 = vld1q_s16( org + 0 * origStride + x1 );
85+
int16x8_t b1 = vld1q_s16( buf + 0 * origStride + x1 );
86+
int16x8_t o2 = vld1q_s16( org + 1 * origStride + x1 );
87+
int16x8_t b2 = vld1q_s16( buf + 1 * buffStride + x1 );
88+
int16x8_t o3 = vld1q_s16( org + 2 * origStride + x1 );
89+
int16x8_t b3 = vld1q_s16( buf + 2 * buffStride + x1 );
90+
int16x8_t o4 = vld1q_s16( org + 3 * origStride + x1 );
91+
int16x8_t b4 = vld1q_s16( buf + 3 * buffStride + x1 );
92+
93+
int16x8_t diff1 = vabdq_s16( o1, b1 );
94+
int16x8_t diff2 = vabdq_s16( o2, b2 );
95+
int16x8_t diff3 = vabdq_s16( o3, b3 );
96+
int16x8_t diff4 = vabdq_s16( o4, b4 );
97+
98+
acc1 = vvenc_sdotq_s16( acc1, diff1, diff1 );
99+
acc2 = vvenc_sdotq_s16( acc2, diff2, diff2 );
100+
acc1 = vvenc_sdotq_s16( acc1, diff3, diff3 );
101+
acc2 = vvenc_sdotq_s16( acc2, diff4, diff4 );
102+
103+
x1 += 8;
104+
} while( x1 != w );
105+
106+
int64x2_t diff2_sum = vaddq_s64( acc1, acc2 );
107+
error += ( int32_t )vaddvq_s64( diff2_sum );
108+
if( error > besterror )
109+
{
110+
return error;
111+
}
112+
113+
org += 4 * origStride;
114+
buf += 4 * buffStride;
115+
h -= 4;
116+
} while( h != 0 );
117+
118+
return error;
119+
}
120+
69121
void applyPlanarCorrection_sve( const Pel* refPel, const ptrdiff_t refStride, Pel* dstPel, const ptrdiff_t dstStride,
70122
const int32_t w, const int32_t h, const ClpRng& clpRng, const uint16_t motionError )
71123
{
@@ -353,6 +405,7 @@ void applyBlock_sve( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const
353405
template<>
354406
void MCTF::_initMCTF_ARM<SVE>()
355407
{
408+
m_motionErrorLumaInt8 = motionErrorLumaInt_sve;
356409
m_applyPlanarCorrection = applyPlanarCorrection_sve;
357410
m_applyBlock = applyBlock_sve;
358411
}

0 commit comments

Comments
 (0)