@@ -47,18 +47,21 @@ POSSIBILITY OF SUCH DAMAGE.
4747// ====================================================================================================================
4848// Includes
4949// ====================================================================================================================
50- #include " ../InterpolationFilter_neon.h"
51- #include " ../CommonDefARM.h"
50+
5251#include " CommonLib/CommonDef.h"
5352#include " CommonLib/InterpolationFilter.h"
54- #include " neon_sve_bridge.h"
55- #include " ../neon/transpose_neon.h"
5653
5754// ! \ingroup CommonLib
5855// ! \{
5956
6057#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF
6158
59+ #include " ../CommonDefARM.h"
60+ #include " ../InterpolationFilter_neon.h"
61+ #include " ../mem_neon.h"
62+ #include " ../neon/transpose_neon.h"
63+ #include " neon_sve_bridge.h"
64+
6265namespace vvenc
6366{
6467
@@ -951,6 +954,329 @@ void simdFilter16xH_N4_sve( const ClpRng& clpRng, Pel const* src, int srcStride,
951954 }
952955}
953956
// Horizontal 8-tap interpolation filter using the 16-bit signed dot-product
// (SDOT) through the Neon/SVE bridge. Each output sample is the dot product of
// an 8-sample source window with the 8 filter taps, plus the rounding offset,
// right-shifted by 'shift', and — when isLast — packed with clipping to the
// valid pel range.
//
// \param src       first source sample of the first row (already offset by the caller)
// \param srcStride source stride in samples
// \param dst       destination buffer
// \param dstStride destination stride in samples
// \param width     block width; must be a multiple of 4
// \param height    block height; must be >= 1
// \param shift     right-shift applied after offsetting the dot product
// \param offset    rounding/normalisation offset added before shifting
// \param clpRng    clipping range; only the max is used, and only when isLast
// \param coeff     the 8 filter taps
template<bool isLast>
void simdInterpolateHor_N8_sve( const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height,
                                int shift, int offset, const ClpRng& clpRng, int16_t const* coeff )
{
  CHECKD( width % 4 != 0, "Width must be a multiple of 4!" );

  const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
  const int32x4_t invshift = vdupq_n_s32( -shift ); // vshlq_s32 with a negative amount performs a right shift.

  const int16x8_t vcoeff  = vld1q_s16( coeff );
  const int32x4_t voffset = vdupq_n_s32( offset );

  do
  {
    int col = 0;
    for( ; col + 8 <= width; col += 8 )
    {
      // Eight overlapping 8-sample windows, one per output pixel.
      int16x8_t vsrc0 = vld1q_s16( &src[col + 0] );
      int16x8_t vsrc1 = vld1q_s16( &src[col + 1] );
      int16x8_t vsrc2 = vld1q_s16( &src[col + 2] );
      int16x8_t vsrc3 = vld1q_s16( &src[col + 3] );
      int16x8_t vsrc4 = vld1q_s16( &src[col + 4] );
      int16x8_t vsrc5 = vld1q_s16( &src[col + 5] );
      int16x8_t vsrc6 = vld1q_s16( &src[col + 6] );
      int16x8_t vsrc7 = vld1q_s16( &src[col + 7] );

      // SDOT accumulates the 8 products of each window into two 64-bit lanes
      // (four products per lane).
      int64x2_t vsum0 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc0, vcoeff );
      int64x2_t vsum1 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc1, vcoeff );
      int64x2_t vsum2 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc2, vcoeff );
      int64x2_t vsum3 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc3, vcoeff );
      int64x2_t vsum4 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc4, vcoeff );
      int64x2_t vsum5 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc5, vcoeff );
      int64x2_t vsum6 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc6, vcoeff );
      int64x2_t vsum7 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc7, vcoeff );

      // Pairwise-add the two lanes of each sum: one complete 8-tap result per
      // 64-bit lane, two output pixels per vector.
      int64x2_t vsum01 = vpaddq_s64( vsum0, vsum1 );
      int64x2_t vsum23 = vpaddq_s64( vsum2, vsum3 );
      int64x2_t vsum45 = vpaddq_s64( vsum4, vsum5 );
      int64x2_t vsum67 = vpaddq_s64( vsum6, vsum7 );

      // Narrow to 32 bit; presumably the sums fit in 32 bits for the supported
      // (<= 10 bit) input depths — see the bd check in simdFilterHor_sve.
      int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vmovn_s64( vsum23 ) );
      int32x4_t vsumb = vcombine_s32( vmovn_s64( vsum45 ), vmovn_s64( vsum67 ) );

      vsuma = vaddq_s32( vsuma, voffset );
      vsumb = vaddq_s32( vsumb, voffset );

      vsuma = vshlq_s32( vsuma, invshift );
      vsumb = vshlq_s32( vsumb, invshift );

      // Pack to s16, clamping to vibdimax when this is the last filter stage.
      int16x8_t vsum = pack_sum_s32_to_s16x8<isLast>( vsuma, vsumb, vibdimax );

      vst1q_s16( &dst[col], vsum );
    }
    if( col != width ) // Last four samples.
    {
      int16x8_t vsrc0 = vld1q_s16( &src[col + 0] );
      int16x8_t vsrc1 = vld1q_s16( &src[col + 1] );
      int16x8_t vsrc2 = vld1q_s16( &src[col + 2] );
      int16x8_t vsrc3 = vld1q_s16( &src[col + 3] );

      int64x2_t vsum0 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc0, vcoeff );
      int64x2_t vsum1 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc1, vcoeff );
      int64x2_t vsum2 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc2, vcoeff );
      int64x2_t vsum3 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc3, vcoeff );

      int64x2_t vsum01 = vpaddq_s64( vsum0, vsum1 );
      int64x2_t vsum23 = vpaddq_s64( vsum2, vsum3 );

      int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vmovn_s64( vsum23 ) );
      vsuma = vaddq_s32( vsuma, voffset );
      vsuma = vshlq_s32( vsuma, invshift );

      int16x4_t vsum = pack_sum_s32_to_s16x4<isLast>( vsuma, vget_low_s16( vibdimax ) );

      vst1_s16( &dst[col], vsum );
    }

    src += srcStride;
    dst += dstStride;
  } while( --height != 0 );
}
1038+
// Horizontal 4-tap interpolation filter using SDOT through the Neon/SVE
// bridge. The four taps are duplicated into both 64-bit halves of the
// coefficient vector (vld1q_dup_u64 on 4 x int16), so one SDOT produces two
// complete 4-tap results — one per 64-bit lane — for output columns col and
// col+4 simultaneously.
//
// Parameters as for simdInterpolateHor_N8_sve; coeff holds the 4 filter taps.
template<bool isLast>
void simdInterpolateHor_N4_sve( const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height,
                                int shift, int offset, const ClpRng& clpRng, int16_t const* coeff )
{
  CHECKD( width % 4 != 0, "Width must be a multiple of 4!" );

  const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
  const int32x4_t invshift = vdupq_n_s32( -shift ); // Negative shift amount == right shift.

  // The 4 taps broadcast into both halves of a 128-bit vector.
  const int16x8_t vcoeffx2 = vreinterpretq_s16_u64( vld1q_dup_u64( ( const uint64_t* )coeff ) );
  const int32x4_t voffset  = vdupq_n_s32( offset );

  do
  {
    int col = 0;
    for( ; col + 8 <= width; col += 8 )
    {
      // vsrcXY covers the windows of output columns X (low half) and Y (high half).
      int16x8_t vsrc04 = vld1q_s16( &src[col + 0] );
      int16x8_t vsrc15 = vld1q_s16( &src[col + 1] );
      int16x8_t vsrc26 = vld1q_s16( &src[col + 2] );
      int16x8_t vsrc37 = vld1q_s16( &src[col + 3] );

      // Each SDOT yields two finished 4-tap sums, one per 64-bit lane.
      int64x2_t vsum04 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc04, vcoeffx2 );
      int64x2_t vsum15 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc15, vcoeffx2 );
      int64x2_t vsum26 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc26, vcoeffx2 );
      int64x2_t vsum37 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc37, vcoeffx2 );

      // Narrow to s32: {s0,s4,s2,s6} and {s1,s5,s3,s7}.
      int32x4_t vsum0426 = vcombine_s32( vmovn_s64( vsum04 ), vmovn_s64( vsum26 ) );
      int32x4_t vsum1537 = vcombine_s32( vmovn_s64( vsum15 ), vmovn_s64( vsum37 ) );

      // Transpose even/odd lanes back into column order: {s0..s3} and {s4..s7}.
      int32x4_t vsuma = vtrn1q_s32( vsum0426, vsum1537 );
      int32x4_t vsumb = vtrn2q_s32( vsum0426, vsum1537 );

      vsuma = vaddq_s32( vsuma, voffset );
      vsumb = vaddq_s32( vsumb, voffset );

      vsuma = vshlq_s32( vsuma, invshift );
      vsumb = vshlq_s32( vsumb, invshift );

      // Pack to s16, clamping to vibdimax when this is the last filter stage.
      int16x8_t vsum = pack_sum_s32_to_s16x8<isLast>( vsuma, vsumb, vibdimax );

      vst1q_s16( &dst[col], vsum );
    }
    if( col != width ) // Last four samples.
    {
      int16x4_t vsrc0 = vld1_s16( &src[col + 0] );
      int16x4_t vsrc1 = vld1_s16( &src[col + 1] );
      int16x4_t vsrc2 = vld1_s16( &src[col + 2] );
      int16x4_t vsrc3 = vld1_s16( &src[col + 3] );

      // Pair two windows per vector so each SDOT again yields two results.
      int16x8_t vsrc01 = vcombine_s16( vsrc0, vsrc1 );
      int16x8_t vsrc23 = vcombine_s16( vsrc2, vsrc3 );
      int64x2_t vsum01 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc01, vcoeffx2 );
      int64x2_t vsum23 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc23, vcoeffx2 );

      int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vmovn_s64( vsum23 ) );
      vsuma = vaddq_s32( vsuma, voffset );
      vsuma = vshlq_s32( vsuma, invshift );

      int16x4_t vsum = pack_sum_s32_to_s16x4<isLast>( vsuma, vget_low_s16( vibdimax ) );

      vst1_s16( &dst[col], vsum );
    }

    src += srcStride;
    dst += dstStride;
  } while( --height != 0 );
}
1107+
// Horizontal 4-tap filter specialised for width == 2. Two rows are processed
// per iteration: the four overlapping 4-sample windows (two columns times two
// rows) are paired into 128-bit vectors so a single SDOT delivers two finished
// 4-tap sums per instruction. A single-row tail handles odd heights.
template<bool isLast>
void simdInterpolateHorM2_N4_sve( const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height,
                                  int shift, int offset, const ClpRng& clpRng, int16_t const* coeff )
{
  CHECKD( width != 2, "Width must be two!" );

  const int16x4_t clipMax  = vdup_n_s16( clpRng.max() );
  const int32x4_t shiftVec = vdupq_n_s32( -shift ); // Negative amount == right shift.

  // The 4 taps broadcast into both 64-bit halves.
  const int16x8_t tapsX2    = vreinterpretq_s16_u64( vld1q_dup_u64( ( const uint64_t* )coeff ) );
  const int32x4_t offsetVec = vdupq_n_s32( offset );

  int row = 0;
  while( row + 1 < height ) // Main loop: two rows at a time.
  {
    int16x4_t winR0C0 = vld1_s16( &src[0] );
    int16x4_t winR0C1 = vld1_s16( &src[1] );
    int16x4_t winR1C0 = vld1_s16( &src[srcStride + 0] );
    int16x4_t winR1C1 = vld1_s16( &src[srcStride + 1] );

    int16x8_t row0Pair = vcombine_s16( winR0C0, winR0C1 );
    int16x8_t row1Pair = vcombine_s16( winR1C0, winR1C1 );
    int64x2_t row0Sum  = vvenc_sdotq_s16( vdupq_n_s64( 0 ), row0Pair, tapsX2 );
    int64x2_t row1Sum  = vvenc_sdotq_s16( vdupq_n_s64( 0 ), row1Pair, tapsX2 );

    // Four results: row0 col0/col1 in the low lanes, row1 col0/col1 in the high.
    int32x4_t sums = vcombine_s32( vmovn_s64( row0Sum ), vmovn_s64( row1Sum ) );
    sums = vaddq_s32( sums, offsetVec );
    sums = vshlq_s32( sums, shiftVec );

    // Pack to s16, clamping to clipMax when this is the last filter stage.
    int16x4_t packed = pack_sum_s32_to_s16x4<isLast>( sums, clipMax );

    store_s16x2x2( &dst[0], packed, dstStride );

    src += 2 * srcStride;
    dst += 2 * dstStride;
    row += 2;
  }
  if( row < height ) // Tail: one remaining row.
  {
    int16x4_t winC0 = vld1_s16( &src[0] );
    int16x4_t winC1 = vld1_s16( &src[1] );

    int16x8_t rowPair = vcombine_s16( winC0, winC1 );
    int64x2_t rowSum  = vvenc_sdotq_s16( vdupq_n_s64( 0 ), rowPair, tapsX2 );

    int32x4_t sums = vcombine_s32( vmovn_s64( rowSum ), vdup_n_s32( 0 ) );
    sums = vaddq_s32( sums, offsetVec );
    sums = vshlq_s32( sums, shiftVec );

    int16x4_t packed = pack_sum_s32_to_s16x4<isLast>( sums, clipMax );

    store_s16x2( &dst[0], packed );
  }
}
1161+
// Dispatcher for the horizontal interpolation filter (4/6/8 taps).
//
// Computes the stage-dependent rounding offset and shift, then selects the
// SIMD kernel for the given width, falling back to a scalar loop for
// width == 1. The 6-tap filter is stored as 8 coefficients with coeff[0] and
// coeff[7] zero, so it reuses the 8-tap kernel with the source offset by -1;
// the scalar 6-tap path instead skips the leading zero tap (c = coeff + 1).
//
// Restructured from the original goto-based fallthrough into per-N branches
// with early returns and a single structured scalar fallback — behavior is
// unchanged.
//
// \tparam N       number of filter taps: 4, 6, or 8
// \tparam isFirst true when this is the first filtering stage of a separable filter
// \tparam isLast  true when this is the last stage (output is clipped to the pel range)
template<int N, bool isFirst, bool isLast>
void simdFilterHor_sve( const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width,
                        int height, TFilterCoeff const* coeff )
{
  static_assert( N == 4 || N == 6 || N == 8, "Supported taps: 4/6/8" );
  CHECKD( height < 1, "Height must be >= 1!" );
  CHECKD( width < 1, "Width must be >= 1!" );
  CHECKD( clpRng.bd > 10, "VVenC does not support bitdepths larger than 10!" );
  CHECKD( IF_INTERNAL_PREC - clpRng.bd < 2, "Bit depth headroom must be at least 2" );

  const int16_t* c = coeff;

  const int cStride = 1; // Horizontal mode.
  src -= ( N / 2 - 1 ) * cStride; // Center the filter window on the output position.

  int offset;
  int headRoom = IF_INTERNAL_PREC - clpRng.bd;
  int shift    = IF_FILTER_PREC;

  // Set shift and offset for N != 2 case.
  if( isLast )
  {
    shift += isFirst ? 0 : headRoom;
    offset  = 1 << ( shift - 1 );
    offset += isFirst ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
  }
  else
  {
    shift -= isFirst ? headRoom : 0;
    offset  = isFirst ? -IF_INTERNAL_OFFS * ( 1 << shift ) : 0;
  }

  if( N == 6 )
  {
    CHECKD( width % 4 != 0 && width != 1, "N6 width must be 1 or multiple of 4! width=" << width );
    CHECKD( coeff[0] != 0 || coeff[7] != 0, "0th and 7th coeff must be zero for 6-tap!" );

    if( width % 4 == 0 )
    {
      src -= 1; // Use 8-tap filter, but offset src by -1 since coeff[0] is always zero.
      simdInterpolateHor_N8_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
      return;
    }
    c = coeff + 1; // width == 1: skip the leading zero tap and run the 6-tap scalar loop.
  }
  else if( N == 8 )
  {
    CHECKD( width % 4 != 0 && width != 1, "N8 width must be 1 or multiple of 4! width=" << width );

    if( width % 4 == 0 )
    {
      simdInterpolateHor_N8_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
      return;
    }
  }
  else // N == 4
  {
    CHECKD( width % 4 != 0 && width != 2 && width != 1, "N4 width must be 1 or 2 or multiple of 4! width=" << width );

    if( width % 4 == 0 )
    {
      simdInterpolateHor_N4_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
      return;
    }
    if( width == 2 )
    {
      simdInterpolateHorM2_N4_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
      return;
    }
  }

  // Use scalar code for width == 1.
  CHECKD( width != 1, "Width must be 1!" );
  do
  {
    int sum = src[0] * c[0];
    sum += src[1] * c[1];
    sum += src[2] * c[2];
    sum += src[3] * c[3];
    if( N >= 6 )
    {
      sum += src[4] * c[4];
      sum += src[5] * c[5];
    }
    if( N == 8 )
    {
      sum += src[6] * c[6];
      sum += src[7] * c[7];
    }

    Pel val = ( sum + offset ) >> shift;
    if( isLast )
    {
      val = ClipPel( val, clpRng ); // Final stage: clip to the valid pel range.
    }
    dst[0] = val;

    src += srcStride;
    dst += dstStride;
  } while( --height != 0 );
}
1279+
9541280template <>
9551281void InterpolationFilter::_initInterpolationFilterARM<SVE>()
9561282{
@@ -968,6 +1294,22 @@ void InterpolationFilter::_initInterpolationFilterARM<SVE>()
9681294 m_filter16xH[0 ][1 ] = simdFilter16xH_N8_sve<true >;
9691295 m_filter16xH[1 ][0 ] = simdFilter16xH_N4_sve<false >;
9701296 m_filter16xH[1 ][1 ] = simdFilter16xH_N4_sve<true >;
1297+
1298+ // SVE is only beneficial for the horizontal filtering of the 4-tap, 6-tap, and 8-tap filters.
1299+ m_filterHor[0 ][0 ][0 ] = simdFilterHor_sve<8 , false , false >;
1300+ m_filterHor[0 ][0 ][1 ] = simdFilterHor_sve<8 , false , true >;
1301+ m_filterHor[0 ][1 ][0 ] = simdFilterHor_sve<8 , true , false >;
1302+ m_filterHor[0 ][1 ][1 ] = simdFilterHor_sve<8 , true , true >;
1303+
1304+ m_filterHor[1 ][0 ][0 ] = simdFilterHor_sve<4 , false , false >;
1305+ m_filterHor[1 ][0 ][1 ] = simdFilterHor_sve<4 , false , true >;
1306+ m_filterHor[1 ][1 ][0 ] = simdFilterHor_sve<4 , true , false >;
1307+ m_filterHor[1 ][1 ][1 ] = simdFilterHor_sve<4 , true , true >;
1308+
1309+ m_filterHor[3 ][0 ][0 ] = simdFilterHor_sve<6 , false , false >;
1310+ m_filterHor[3 ][0 ][1 ] = simdFilterHor_sve<6 , false , true >;
1311+ m_filterHor[3 ][1 ][0 ] = simdFilterHor_sve<6 , true , false >;
1312+ m_filterHor[3 ][1 ][1 ] = simdFilterHor_sve<6 , true , true >;
9711313}
9721314
9731315} // namespace vvenc
0 commit comments