@@ -536,6 +536,14 @@ static inline int16x8_t filter_horiz_8x1_N4_neon( Pel const* src, int16x4_t ch,
536536 return vcombine_s16 ( lo, hi );
537537}
538538
539+ static inline int16x8x2_t filter_horiz_16x1_N4_neon ( Pel const * src, int16x4_t ch, int32x4_t voffset1, int32x4_t invshift1st )
540+ {
541+ int16x8x2_t result;
542+ result.val [0 ] = filter_horiz_8x1_N4_neon ( src + 0 , ch, voffset1, invshift1st );
543+ result.val [1 ] = filter_horiz_8x1_N4_neon ( src + 8 , ch, voffset1, invshift1st );
544+ return result;
545+ }
546+
539547static inline int32x4_t filter_vert_4x1_N4_neon ( int16x4_t const * vsrc, int16x4_t cv, int32x4_t voffset2 )
540548{
541549 int32x4_t vsum = vmlal_lane_s16 ( voffset2, vsrc[0 ], cv, 0 );
@@ -560,6 +568,31 @@ static inline int32x4x2_t filter_vert_8x1_N4_neon( int16x8_t const* vsrc, int16x
560568 return vsum;
561569}
562570
571+ static inline int32x4x4_t filter_vert_16x1_N4_neon ( int16x8x2_t const * vsrc, int16x4_t cv, int32x4_t voffset2 )
572+ {
573+ int32x4x4_t vsum;
574+ vsum.val [0 ] = vmlal_lane_s16 ( voffset2, vget_low_s16 ( vsrc[0 ].val [0 ] ), cv, 0 );
575+ vsum.val [0 ] = vmlal_lane_s16 ( vsum.val [0 ], vget_low_s16 ( vsrc[1 ].val [0 ] ), cv, 1 );
576+ vsum.val [0 ] = vmlal_lane_s16 ( vsum.val [0 ], vget_low_s16 ( vsrc[2 ].val [0 ] ), cv, 2 );
577+ vsum.val [0 ] = vmlal_lane_s16 ( vsum.val [0 ], vget_low_s16 ( vsrc[3 ].val [0 ] ), cv, 3 );
578+
579+ vsum.val [1 ] = vmlal_lane_s16 ( voffset2, vget_high_s16 ( vsrc[0 ].val [0 ] ), cv, 0 );
580+ vsum.val [1 ] = vmlal_lane_s16 ( vsum.val [1 ], vget_high_s16 ( vsrc[1 ].val [0 ] ), cv, 1 );
581+ vsum.val [1 ] = vmlal_lane_s16 ( vsum.val [1 ], vget_high_s16 ( vsrc[2 ].val [0 ] ), cv, 2 );
582+ vsum.val [1 ] = vmlal_lane_s16 ( vsum.val [1 ], vget_high_s16 ( vsrc[3 ].val [0 ] ), cv, 3 );
583+
584+ vsum.val [2 ] = vmlal_lane_s16 ( voffset2, vget_low_s16 ( vsrc[0 ].val [1 ] ), cv, 0 );
585+ vsum.val [2 ] = vmlal_lane_s16 ( vsum.val [2 ], vget_low_s16 ( vsrc[1 ].val [1 ] ), cv, 1 );
586+ vsum.val [2 ] = vmlal_lane_s16 ( vsum.val [2 ], vget_low_s16 ( vsrc[2 ].val [1 ] ), cv, 2 );
587+ vsum.val [2 ] = vmlal_lane_s16 ( vsum.val [2 ], vget_low_s16 ( vsrc[3 ].val [1 ] ), cv, 3 );
588+
589+ vsum.val [3 ] = vmlal_lane_s16 ( voffset2, vget_high_s16 ( vsrc[0 ].val [1 ] ), cv, 0 );
590+ vsum.val [3 ] = vmlal_lane_s16 ( vsum.val [3 ], vget_high_s16 ( vsrc[1 ].val [1 ] ), cv, 1 );
591+ vsum.val [3 ] = vmlal_lane_s16 ( vsum.val [3 ], vget_high_s16 ( vsrc[2 ].val [1 ] ), cv, 2 );
592+ vsum.val [3 ] = vmlal_lane_s16 ( vsum.val [3 ], vget_high_s16 ( vsrc[3 ].val [1 ] ), cv, 3 );
593+ return vsum;
594+ }
595+
563596template <bool isLast>
564597void simdFilter4x4_N4_neon ( const ClpRng& clpRng, Pel const * src, int srcStride, Pel* dst, int dstStride, int width,
565598 int height, TFilterCoeff const * coeffH, TFilterCoeff const * coeffV )
@@ -752,6 +785,143 @@ void simdFilter8xH_N4_neon( const ClpRng& clpRng, Pel const* src, int srcStride,
752785 }
753786}
754787
788+ template <bool isLast>
789+ void simdFilter16xH_N4_neon ( const ClpRng& clpRng, Pel const * src, int srcStride, Pel* dst, int dstStride, int width,
790+ int height, TFilterCoeff const * coeffH, TFilterCoeff const * coeffV )
791+ {
792+ CHECKD ( width != 16 , " Width must be 16" );
793+ CHECKD ( height < 2 , " Height must be >= 2" );
794+ CHECKD ( height % 4 != 0 && height != 2 , " Height must be a multiple of 4" );
795+ CHECKD ( IF_INTERNAL_PREC - clpRng.bd < 2 , " Bit depth headroom must be at least 2" );
796+
797+ OFFSET ( src, srcStride, -1 , -1 );
798+
799+ const int headRoom = IF_INTERNAL_PREC - clpRng.bd ;
800+ const int shift1st = IF_FILTER_PREC - headRoom;
801+ const int shift2nd = IF_FILTER_PREC + headRoom;
802+
803+ const int offset1st = -IF_INTERNAL_OFFS * ( 1 << shift1st );
804+ int offset2nd;
805+ if ( isLast )
806+ {
807+ offset2nd = ( 1 << ( shift2nd - 1 ) ) + ( IF_INTERNAL_OFFS << IF_FILTER_PREC );
808+ }
809+ else
810+ {
811+ offset2nd = 0 ;
812+ }
813+ const int32x4_t voffset1 = vdupq_n_s32 ( offset1st );
814+ const int32x4_t voffset2 = vdupq_n_s32 ( offset2nd );
815+ const int16x8_t vibdimax = vdupq_n_s16 ( clpRng.max () );
816+
817+ int16x4_t ch = vld1_s16 ( coeffH );
818+ int16x4_t cv = vld1_s16 ( coeffV );
819+
820+ int32x4_t invshift1st = vdupq_n_s32 ( -shift1st );
821+ int32x4_t invshift2nd = vdupq_n_s32 ( -shift2nd );
822+
823+ int16x8x2_t vsrcv[7 ];
824+ vsrcv[0 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
825+ src += srcStride;
826+ vsrcv[1 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
827+ src += srcStride;
828+ vsrcv[2 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
829+ src += srcStride;
830+
831+ if ( height >= 4 )
832+ {
833+ do
834+ {
835+ vsrcv[3 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
836+ src += srcStride;
837+ vsrcv[4 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
838+ src += srcStride;
839+ vsrcv[5 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
840+ src += srcStride;
841+ vsrcv[6 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
842+ src += srcStride;
843+
844+ int h = 0 ;
845+ do
846+ {
847+ int32x4x4_t vsum = filter_vert_16x1_N4_neon ( vsrcv + h, cv, voffset2 );
848+
849+ int16x8_t vsum01, vsum23;
850+ if ( isLast ) // clip
851+ {
852+ int32x4_t vsum0 = vshlq_s32 ( vsum.val [0 ], invshift2nd );
853+ int32x4_t vsum1 = vshlq_s32 ( vsum.val [1 ], invshift2nd );
854+ int32x4_t vsum2 = vshlq_s32 ( vsum.val [2 ], invshift2nd );
855+ int32x4_t vsum3 = vshlq_s32 ( vsum.val [3 ], invshift2nd );
856+
857+ uint16x8_t usum01 = vcombine_u16 ( vqmovun_s32 ( vsum0 ), vqmovun_s32 ( vsum1 ) );
858+ uint16x8_t usum23 = vcombine_u16 ( vqmovun_s32 ( vsum2 ), vqmovun_s32 ( vsum3 ) );
859+
860+ vsum01 = vminq_s16 ( vibdimax, vreinterpretq_s16_u16 ( usum01 ) );
861+ vsum23 = vminq_s16 ( vibdimax, vreinterpretq_s16_u16 ( usum23 ) );
862+ }
863+ else
864+ {
865+ vsum01 =
866+ vcombine_s16 ( vqshrn_n_s32 ( vsum.val [0 ], IF_FILTER_PREC ), vqshrn_n_s32 ( vsum.val [1 ], IF_FILTER_PREC ) );
867+ vsum23 =
868+ vcombine_s16 ( vqshrn_n_s32 ( vsum.val [2 ], IF_FILTER_PREC ), vqshrn_n_s32 ( vsum.val [3 ], IF_FILTER_PREC ) );
869+ }
870+
871+ vst1q_s16 ( dst + 0 , vsum01 );
872+ vst1q_s16 ( dst + 8 , vsum23 );
873+
874+ dst += dstStride;
875+ } while ( ++h != 4 );
876+
877+ vsrcv[0 ] = vsrcv[4 ];
878+ vsrcv[1 ] = vsrcv[5 ];
879+ vsrcv[2 ] = vsrcv[6 ];
880+
881+ height -= 4 ;
882+ } while ( height != 0 );
883+ }
884+ else // height == 2
885+ {
886+ vsrcv[3 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
887+ src += srcStride;
888+ vsrcv[4 ] = filter_horiz_16x1_N4_neon ( src, ch, voffset1, invshift1st );
889+
890+ int h = 0 ;
891+ do
892+ {
893+ int32x4x4_t vsum = filter_vert_16x1_N4_neon ( vsrcv + h, cv, voffset2 );
894+
895+ int16x8_t vsum01, vsum23;
896+ if ( isLast ) // clip
897+ {
898+ int32x4_t vsum0 = vshlq_s32 ( vsum.val [0 ], invshift2nd );
899+ int32x4_t vsum1 = vshlq_s32 ( vsum.val [1 ], invshift2nd );
900+ int32x4_t vsum2 = vshlq_s32 ( vsum.val [2 ], invshift2nd );
901+ int32x4_t vsum3 = vshlq_s32 ( vsum.val [3 ], invshift2nd );
902+
903+ uint16x8_t usum01 = vcombine_u16 ( vqmovun_s32 ( vsum0 ), vqmovun_s32 ( vsum1 ) );
904+ uint16x8_t usum23 = vcombine_u16 ( vqmovun_s32 ( vsum2 ), vqmovun_s32 ( vsum3 ) );
905+
906+ vsum01 = vminq_s16 ( vibdimax, vreinterpretq_s16_u16 ( usum01 ) );
907+ vsum23 = vminq_s16 ( vibdimax, vreinterpretq_s16_u16 ( usum23 ) );
908+ }
909+ else
910+ {
911+ vsum01 =
912+ vcombine_s16 ( vqshrn_n_s32 ( vsum.val [0 ], IF_FILTER_PREC ), vqshrn_n_s32 ( vsum.val [1 ], IF_FILTER_PREC ) );
913+ vsum23 =
914+ vcombine_s16 ( vqshrn_n_s32 ( vsum.val [2 ], IF_FILTER_PREC ), vqshrn_n_s32 ( vsum.val [3 ], IF_FILTER_PREC ) );
915+ }
916+
917+ vst1q_s16 ( dst + 0 , vsum01 );
918+ vst1q_s16 ( dst + 8 , vsum23 );
919+
920+ dst += dstStride;
921+ } while ( ++h != 2 );
922+ }
923+ }
924+
755925template <int N, bool shiftBack>
756926static void simdInterpolateHorM8_Neon ( const int16_t * src, int srcStride, int16_t *dst, int dstStride, int width, int height, int shift, int offset, const ClpRng& clpRng, int16_t const *coeff )
757927{
@@ -1460,6 +1630,8 @@ void InterpolationFilter::_initInterpolationFilterARM<NEON>()
14601630
14611631 m_filter16xH[0 ][0 ] = simdFilter16xH_N8_neon<false >;
14621632 m_filter16xH[0 ][1 ] = simdFilter16xH_N8_neon<true >;
1633+ m_filter16xH[1 ][0 ] = simdFilter16xH_N4_neon<false >;
1634+ m_filter16xH[1 ][1 ] = simdFilter16xH_N4_neon<true >;
14631635
14641636 m_filterN2_2D = simdInterpolateN2_2D_neon;
14651637
0 commit comments