
Commit 540f495

Arm: Add SVE implementation of motionErrorLumaInt
Add an SVE implementation of motionErrorLumaInt, making use of the SVE 16-bit dot-product instructions. Compared to the existing Neon implementation, this new version runs about 25-30% faster when benchmarked on a Neoverse V2 micro-architecture with LLVM 19.
1 parent 601c141 commit 540f495
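The SVE 16-bit dot-product path mentioned in the commit message goes through a vvenc_sdotq_s16 helper that is defined elsewhere in the vvenc Arm sources. As a minimal sketch of what such a helper typically looks like, assuming the common Neon-SVE bridge pattern (the helper name sdotq_s16_sketch below is hypothetical and the actual vvenc definition may differ):

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

// Sketch only, not the vvenc definition. Accumulates the signed 16-bit dot
// product of a and b into the 64-bit lanes of acc with the SVE SDOT
// instruction, operating on fixed 128-bit Neon registers via the Neon-SVE
// bridge. Requires building with SVE enabled (e.g. an -march=...+sve target).
static inline int64x2_t sdotq_s16_sketch( int64x2_t acc, int16x8_t a, int16x8_t b )
{
  svint64_t acc_sve = svset_neonq_s64( svundef_s64(), acc );
  svint16_t a_sve   = svset_neonq_s16( svundef_s16(), a );
  svint16_t b_sve   = svset_neonq_s16( svundef_s16(), b );
  // Each 64-bit lane of the result accumulates four int16 * int16 products.
  return svget_neonq_s64( svdot_s64( acc_sve, a_sve, b_sve ) );
}

This is why the kernel in the diff can square-and-accumulate the absolute differences directly into two int64x2_t accumulators, without the separate widening multiplies and pairwise adds a plain Neon implementation typically needs.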

File tree

1 file changed: +53 −0 lines changed


source/Lib/CommonLib/arm/sve/MCTF_sve.cpp

Lines changed: 53 additions & 0 deletions
@@ -66,6 +66,58 @@ POSSIBILITY OF SUCH DAMAGE.
 
 namespace vvenc
 {
+int motionErrorLumaInt_sve( const Pel* org, const ptrdiff_t origStride, const Pel* buf, const ptrdiff_t buffStride,
+                            const int w, int h, const int besterror )
+{
+  CHECKD( w % 8 != 0, "Width must be a multiple of eight" );
+  CHECKD( h % 4 != 0, "Height must be a multiple of four" );
+
+  int error = 0;
+  do
+  {
+    int64x2_t acc1 = vdupq_n_s64( 0 );
+    int64x2_t acc2 = vdupq_n_s64( 0 );
+
+    int x1 = 0;
+    do
+    {
+      int16x8_t o1 = vld1q_s16( org + 0 * origStride + x1 );
+      int16x8_t b1 = vld1q_s16( buf + 0 * origStride + x1 );
+      int16x8_t o2 = vld1q_s16( org + 1 * origStride + x1 );
+      int16x8_t b2 = vld1q_s16( buf + 1 * buffStride + x1 );
+      int16x8_t o3 = vld1q_s16( org + 2 * origStride + x1 );
+      int16x8_t b3 = vld1q_s16( buf + 2 * buffStride + x1 );
+      int16x8_t o4 = vld1q_s16( org + 3 * origStride + x1 );
+      int16x8_t b4 = vld1q_s16( buf + 3 * buffStride + x1 );
+
+      int16x8_t diff1 = vabdq_s16( o1, b1 );
+      int16x8_t diff2 = vabdq_s16( o2, b2 );
+      int16x8_t diff3 = vabdq_s16( o3, b3 );
+      int16x8_t diff4 = vabdq_s16( o4, b4 );
+
+      acc1 = vvenc_sdotq_s16( acc1, diff1, diff1 );
+      acc2 = vvenc_sdotq_s16( acc2, diff2, diff2 );
+      acc1 = vvenc_sdotq_s16( acc1, diff3, diff3 );
+      acc2 = vvenc_sdotq_s16( acc2, diff4, diff4 );
+
+      x1 += 8;
+    } while( x1 != w );
+
+    int64x2_t diff2_sum = vaddq_s64( acc1, acc2 );
+    error += ( int32_t )vaddvq_s64( diff2_sum );
+    if( error > besterror )
+    {
+      return error;
+    }
+
+    org += 4 * origStride;
+    buf += 4 * buffStride;
+    h -= 4;
+  } while( h != 0 );
+
+  return error;
+}
+
 void applyPlanarCorrection_sve( const Pel* refPel, const ptrdiff_t refStride, Pel* dstPel, const ptrdiff_t dstStride,
                                 const int32_t w, const int32_t h, const ClpRng& clpRng, const uint16_t motionError )
 {
@@ -353,6 +405,7 @@ void applyBlock_sve( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const
 template<>
 void MCTF::_initMCTF_ARM<SVE>()
 {
+  m_motionErrorLumaInt8 = motionErrorLumaInt_sve;
   m_applyPlanarCorrection = applyPlanarCorrection_sve;
   m_applyBlock = applyBlock_sve;
 }

0 commit comments
