Skip to content

Commit 2f089c5

Browse files
committed
Arm: Add Neon implementation of InterpolationFilter::simdFilter16xH_N4
Add Neon implementation of simdFilter16xH_N4. This Neon implementation is around 1.54x faster than the SIMDe version when benchmarked on a Neoverse V2 micro-architecture using LLVM 20.
1 parent 612f42a commit 2f089c5

File tree

1 file changed

+172
-0
lines changed

1 file changed

+172
-0
lines changed

source/Lib/CommonLib/arm/neon/InterpolationFilter_neon.cpp

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,14 @@ static inline int16x8_t filter_horiz_8x1_N4_neon( Pel const* src, int16x4_t ch,
536536
return vcombine_s16( lo, hi );
537537
}
538538

539+
// Horizontal pass for one row of 16 outputs.
// Delegates to filter_horiz_8x1_N4_neon twice - once at src and once at src + 8 - forwarding the
// coefficients (ch), offset (voffset1) and shift (invshift1st) unchanged, and returns the two
// int16x8_t results as val[0] and val[1] of an int16x8x2_t.
static inline int16x8x2_t filter_horiz_16x1_N4_neon( Pel const* src, int16x4_t ch, int32x4_t voffset1, int32x4_t invshift1st )
{
  const int16x8_t firstHalf  = filter_horiz_8x1_N4_neon( src, ch, voffset1, invshift1st );
  const int16x8_t secondHalf = filter_horiz_8x1_N4_neon( src + 8, ch, voffset1, invshift1st );

  int16x8x2_t row;
  row.val[0] = firstHalf;
  row.val[1] = secondHalf;
  return row;
}
546+
539547
static inline int32x4_t filter_vert_4x1_N4_neon( int16x4_t const* vsrc, int16x4_t cv, int32x4_t voffset2 )
540548
{
541549
int32x4_t vsum = vmlal_lane_s16( voffset2, vsrc[0], cv, 0 );
@@ -560,6 +568,31 @@ static inline int32x4x2_t filter_vert_8x1_N4_neon( int16x8_t const* vsrc, int16x
560568
return vsum;
561569
}
562570

571+
// Vertical 4-tap pass for one 16-sample output row.
// vsrc points at four consecutive entries, each holding a row as two int16x8_t halves. Entry k is
// weighted by lane k of cv using a widening multiply-accumulate, and every accumulator is seeded
// with voffset2. The 16 32-bit sums are returned as four int32x4_t: val[0] and val[1] are the low
// and high four lanes of the first half-row, val[2] and val[3] those of the second half-row.
static inline int32x4x4_t filter_vert_16x1_N4_neon( int16x8x2_t const* vsrc, int16x4_t cv, int32x4_t voffset2 )
{
  int32x4x4_t acc;

  for( int half = 0; half < 2; ++half )
  {
    const int16x8_t row0 = vsrc[0].val[half];
    const int16x8_t row1 = vsrc[1].val[half];
    const int16x8_t row2 = vsrc[2].val[half];
    const int16x8_t row3 = vsrc[3].val[half];

    // Lanes 0..3 of this half-row.
    int32x4_t lo = vmlal_lane_s16( voffset2, vget_low_s16( row0 ), cv, 0 );
    lo           = vmlal_lane_s16( lo, vget_low_s16( row1 ), cv, 1 );
    lo           = vmlal_lane_s16( lo, vget_low_s16( row2 ), cv, 2 );
    lo           = vmlal_lane_s16( lo, vget_low_s16( row3 ), cv, 3 );

    // Lanes 4..7 of this half-row.
    int32x4_t hi = vmlal_lane_s16( voffset2, vget_high_s16( row0 ), cv, 0 );
    hi           = vmlal_lane_s16( hi, vget_high_s16( row1 ), cv, 1 );
    hi           = vmlal_lane_s16( hi, vget_high_s16( row2 ), cv, 2 );
    hi           = vmlal_lane_s16( hi, vget_high_s16( row3 ), cv, 3 );

    acc.val[2 * half + 0] = lo;
    acc.val[2 * half + 1] = hi;
  }

  return acc;
}
595+
563596
template<bool isLast>
564597
void simdFilter4x4_N4_neon( const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width,
565598
int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
@@ -752,6 +785,143 @@ void simdFilter8xH_N4_neon( const ClpRng& clpRng, Pel const* src, int srcStride,
752785
}
753786
}
754787

788+
template<bool isLast>
789+
void simdFilter16xH_N4_neon( const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width,
790+
int height, TFilterCoeff const* coeffH, TFilterCoeff const* coeffV )
791+
{
792+
CHECKD( width != 16, "Width must be 16" );
793+
CHECKD( height < 2, "Height must be >= 2" );
794+
CHECKD( height % 4 != 0 && height != 2, "Height must be a multiple of 4" );
795+
CHECKD( IF_INTERNAL_PREC - clpRng.bd < 2, "Bit depth headroom must be at least 2" );
796+
797+
OFFSET( src, srcStride, -1, -1 );
798+
799+
const int headRoom = IF_INTERNAL_PREC - clpRng.bd;
800+
const int shift1st = IF_FILTER_PREC - headRoom;
801+
const int shift2nd = IF_FILTER_PREC + headRoom;
802+
803+
const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
804+
int offset2nd;
805+
if( isLast )
806+
{
807+
offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
808+
}
809+
else
810+
{
811+
offset2nd = 0;
812+
}
813+
const int32x4_t voffset1 = vdupq_n_s32( offset1st );
814+
const int32x4_t voffset2 = vdupq_n_s32( offset2nd );
815+
const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
816+
817+
int16x4_t ch = vld1_s16( coeffH );
818+
int16x4_t cv = vld1_s16( coeffV );
819+
820+
int32x4_t invshift1st = vdupq_n_s32( -shift1st );
821+
int32x4_t invshift2nd = vdupq_n_s32( -shift2nd );
822+
823+
int16x8x2_t vsrcv[7];
824+
vsrcv[0] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
825+
src += srcStride;
826+
vsrcv[1] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
827+
src += srcStride;
828+
vsrcv[2] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
829+
src += srcStride;
830+
831+
if( height >= 4 )
832+
{
833+
do
834+
{
835+
vsrcv[3] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
836+
src += srcStride;
837+
vsrcv[4] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
838+
src += srcStride;
839+
vsrcv[5] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
840+
src += srcStride;
841+
vsrcv[6] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
842+
src += srcStride;
843+
844+
int h = 0;
845+
do
846+
{
847+
int32x4x4_t vsum = filter_vert_16x1_N4_neon( vsrcv + h, cv, voffset2 );
848+
849+
int16x8_t vsum01, vsum23;
850+
if( isLast ) // clip
851+
{
852+
int32x4_t vsum0 = vshlq_s32( vsum.val[0], invshift2nd );
853+
int32x4_t vsum1 = vshlq_s32( vsum.val[1], invshift2nd );
854+
int32x4_t vsum2 = vshlq_s32( vsum.val[2], invshift2nd );
855+
int32x4_t vsum3 = vshlq_s32( vsum.val[3], invshift2nd );
856+
857+
uint16x8_t usum01 = vcombine_u16( vqmovun_s32( vsum0 ), vqmovun_s32( vsum1 ) );
858+
uint16x8_t usum23 = vcombine_u16( vqmovun_s32( vsum2 ), vqmovun_s32( vsum3 ) );
859+
860+
vsum01 = vminq_s16( vibdimax, vreinterpretq_s16_u16( usum01 ) );
861+
vsum23 = vminq_s16( vibdimax, vreinterpretq_s16_u16( usum23 ) );
862+
}
863+
else
864+
{
865+
vsum01 =
866+
vcombine_s16( vqshrn_n_s32( vsum.val[0], IF_FILTER_PREC ), vqshrn_n_s32( vsum.val[1], IF_FILTER_PREC ) );
867+
vsum23 =
868+
vcombine_s16( vqshrn_n_s32( vsum.val[2], IF_FILTER_PREC ), vqshrn_n_s32( vsum.val[3], IF_FILTER_PREC ) );
869+
}
870+
871+
vst1q_s16( dst + 0, vsum01 );
872+
vst1q_s16( dst + 8, vsum23 );
873+
874+
dst += dstStride;
875+
} while( ++h != 4 );
876+
877+
vsrcv[0] = vsrcv[4];
878+
vsrcv[1] = vsrcv[5];
879+
vsrcv[2] = vsrcv[6];
880+
881+
height -= 4;
882+
} while( height != 0 );
883+
}
884+
else // height == 2
885+
{
886+
vsrcv[3] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
887+
src += srcStride;
888+
vsrcv[4] = filter_horiz_16x1_N4_neon( src, ch, voffset1, invshift1st );
889+
890+
int h = 0;
891+
do
892+
{
893+
int32x4x4_t vsum = filter_vert_16x1_N4_neon( vsrcv + h, cv, voffset2 );
894+
895+
int16x8_t vsum01, vsum23;
896+
if( isLast ) // clip
897+
{
898+
int32x4_t vsum0 = vshlq_s32( vsum.val[0], invshift2nd );
899+
int32x4_t vsum1 = vshlq_s32( vsum.val[1], invshift2nd );
900+
int32x4_t vsum2 = vshlq_s32( vsum.val[2], invshift2nd );
901+
int32x4_t vsum3 = vshlq_s32( vsum.val[3], invshift2nd );
902+
903+
uint16x8_t usum01 = vcombine_u16( vqmovun_s32( vsum0 ), vqmovun_s32( vsum1 ) );
904+
uint16x8_t usum23 = vcombine_u16( vqmovun_s32( vsum2 ), vqmovun_s32( vsum3 ) );
905+
906+
vsum01 = vminq_s16( vibdimax, vreinterpretq_s16_u16( usum01 ) );
907+
vsum23 = vminq_s16( vibdimax, vreinterpretq_s16_u16( usum23 ) );
908+
}
909+
else
910+
{
911+
vsum01 =
912+
vcombine_s16( vqshrn_n_s32( vsum.val[0], IF_FILTER_PREC ), vqshrn_n_s32( vsum.val[1], IF_FILTER_PREC ) );
913+
vsum23 =
914+
vcombine_s16( vqshrn_n_s32( vsum.val[2], IF_FILTER_PREC ), vqshrn_n_s32( vsum.val[3], IF_FILTER_PREC ) );
915+
}
916+
917+
vst1q_s16( dst + 0, vsum01 );
918+
vst1q_s16( dst + 8, vsum23 );
919+
920+
dst += dstStride;
921+
} while( ++h != 2 );
922+
}
923+
}
924+
755925
template<int N, bool shiftBack>
756926
static void simdInterpolateHorM8_Neon( const int16_t* src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
757927
{
@@ -1460,6 +1630,8 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()
14601630

14611631
m_filter16xH[0][0] = simdFilter16xH_N8_neon<false>;
14621632
m_filter16xH[0][1] = simdFilter16xH_N8_neon<true>;
1633+
m_filter16xH[1][0] = simdFilter16xH_N4_neon<false>;
1634+
m_filter16xH[1][1] = simdFilter16xH_N4_neon<true>;
14631635

14641636
m_filterN2_2D = simdInterpolateN2_2D_neon;
14651637

0 commit comments

Comments
 (0)