Skip to content

Commit a0ce280

Browse files
committed
Arm: Add SVE implementations of simdFilterHor
Add SVE implementations of the `simdInterpolateHor` filters for the 4-tap, 6-tap, and 8-tap. Performance uplift vs Neon: N8: 1.36x N6: 1.09x N4: 1.09x These benchmarks are obtained from a Neoverse V2 using LLVM 21.
1 parent 421602d commit a0ce280

File tree

1 file changed

+346
-4
lines changed

1 file changed

+346
-4
lines changed

source/Lib/CommonLib/arm/sve/InterpolationFilter_sve.cpp

Lines changed: 346 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,18 +47,21 @@ POSSIBILITY OF SUCH DAMAGE.
4747
// ====================================================================================================================
4848
// Includes
4949
// ====================================================================================================================
50-
#include "../InterpolationFilter_neon.h"
51-
#include "../CommonDefARM.h"
50+
5251
#include "CommonLib/CommonDef.h"
5352
#include "CommonLib/InterpolationFilter.h"
54-
#include "neon_sve_bridge.h"
55-
#include "../neon/transpose_neon.h"
5653

5754
//! \ingroup CommonLib
5855
//! \{
5956

6057
#if defined( TARGET_SIMD_ARM ) && ENABLE_SIMD_OPT_MCIF
6158

59+
#include "../CommonDefARM.h"
60+
#include "../InterpolationFilter_neon.h"
61+
#include "../mem_neon.h"
62+
#include "../neon/transpose_neon.h"
63+
#include "neon_sve_bridge.h"
64+
6265
namespace vvenc
6366
{
6467

@@ -951,6 +954,329 @@ void simdFilter16xH_N4_sve( const ClpRng& clpRng, Pel const* src, int srcStride,
951954
}
952955
}
953956

957+
template<bool isLast>
958+
void simdInterpolateHor_N8_sve( const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height,
959+
int shift, int offset, const ClpRng& clpRng, int16_t const* coeff )
960+
{
961+
CHECKD( width % 4 != 0, "Width must be a multiple of 4!" );
962+
963+
const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
964+
const int32x4_t invshift = vdupq_n_s32( -shift );
965+
966+
const int16x8_t vcoeff = vld1q_s16( coeff );
967+
const int32x4_t voffset = vdupq_n_s32( offset );
968+
969+
do
970+
{
971+
int col = 0;
972+
for( ; col + 8 <= width; col += 8 )
973+
{
974+
int16x8_t vsrc0 = vld1q_s16( &src[col + 0] );
975+
int16x8_t vsrc1 = vld1q_s16( &src[col + 1] );
976+
int16x8_t vsrc2 = vld1q_s16( &src[col + 2] );
977+
int16x8_t vsrc3 = vld1q_s16( &src[col + 3] );
978+
int16x8_t vsrc4 = vld1q_s16( &src[col + 4] );
979+
int16x8_t vsrc5 = vld1q_s16( &src[col + 5] );
980+
int16x8_t vsrc6 = vld1q_s16( &src[col + 6] );
981+
int16x8_t vsrc7 = vld1q_s16( &src[col + 7] );
982+
983+
int64x2_t vsum0 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc0, vcoeff );
984+
int64x2_t vsum1 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc1, vcoeff );
985+
int64x2_t vsum2 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc2, vcoeff );
986+
int64x2_t vsum3 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc3, vcoeff );
987+
int64x2_t vsum4 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc4, vcoeff );
988+
int64x2_t vsum5 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc5, vcoeff );
989+
int64x2_t vsum6 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc6, vcoeff );
990+
int64x2_t vsum7 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc7, vcoeff );
991+
992+
int64x2_t vsum01 = vpaddq_s64( vsum0, vsum1 );
993+
int64x2_t vsum23 = vpaddq_s64( vsum2, vsum3 );
994+
int64x2_t vsum45 = vpaddq_s64( vsum4, vsum5 );
995+
int64x2_t vsum67 = vpaddq_s64( vsum6, vsum7 );
996+
997+
int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vmovn_s64( vsum23 ) );
998+
int32x4_t vsumb = vcombine_s32( vmovn_s64( vsum45 ), vmovn_s64( vsum67 ) );
999+
1000+
vsuma = vaddq_s32( vsuma, voffset );
1001+
vsumb = vaddq_s32( vsumb, voffset );
1002+
1003+
vsuma = vshlq_s32( vsuma, invshift );
1004+
vsumb = vshlq_s32( vsumb, invshift );
1005+
1006+
int16x8_t vsum = pack_sum_s32_to_s16x8<isLast>( vsuma, vsumb, vibdimax );
1007+
1008+
vst1q_s16( &dst[col], vsum );
1009+
}
1010+
if( col != width ) // Last four samples.
1011+
{
1012+
int16x8_t vsrc0 = vld1q_s16( &src[col + 0] );
1013+
int16x8_t vsrc1 = vld1q_s16( &src[col + 1] );
1014+
int16x8_t vsrc2 = vld1q_s16( &src[col + 2] );
1015+
int16x8_t vsrc3 = vld1q_s16( &src[col + 3] );
1016+
1017+
int64x2_t vsum0 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc0, vcoeff );
1018+
int64x2_t vsum1 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc1, vcoeff );
1019+
int64x2_t vsum2 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc2, vcoeff );
1020+
int64x2_t vsum3 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc3, vcoeff );
1021+
1022+
int64x2_t vsum01 = vpaddq_s64( vsum0, vsum1 );
1023+
int64x2_t vsum23 = vpaddq_s64( vsum2, vsum3 );
1024+
1025+
int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vmovn_s64( vsum23 ) );
1026+
vsuma = vaddq_s32( vsuma, voffset );
1027+
vsuma = vshlq_s32( vsuma, invshift );
1028+
1029+
int16x4_t vsum = pack_sum_s32_to_s16x4<isLast>( vsuma, vget_low_s16( vibdimax ) );
1030+
1031+
vst1_s16( &dst[col], vsum );
1032+
}
1033+
1034+
src += srcStride;
1035+
dst += dstStride;
1036+
} while( --height != 0 );
1037+
}
1038+
1039+
template<bool isLast>
1040+
void simdInterpolateHor_N4_sve( const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height,
1041+
int shift, int offset, const ClpRng& clpRng, int16_t const* coeff )
1042+
{
1043+
CHECKD( width % 4 != 0, "Width must be a multiple of 4!" );
1044+
1045+
const int16x8_t vibdimax = vdupq_n_s16( clpRng.max() );
1046+
const int32x4_t invshift = vdupq_n_s32( -shift );
1047+
1048+
const int16x8_t vcoeffx2 = vreinterpretq_s16_u64( vld1q_dup_u64( ( const uint64_t* )coeff ) );
1049+
const int32x4_t voffset = vdupq_n_s32( offset );
1050+
1051+
do
1052+
{
1053+
int col = 0;
1054+
for( ; col + 8 <= width; col += 8 )
1055+
{
1056+
int16x8_t vsrc04 = vld1q_s16( &src[col + 0] );
1057+
int16x8_t vsrc15 = vld1q_s16( &src[col + 1] );
1058+
int16x8_t vsrc26 = vld1q_s16( &src[col + 2] );
1059+
int16x8_t vsrc37 = vld1q_s16( &src[col + 3] );
1060+
1061+
int64x2_t vsum04 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc04, vcoeffx2 );
1062+
int64x2_t vsum15 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc15, vcoeffx2 );
1063+
int64x2_t vsum26 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc26, vcoeffx2 );
1064+
int64x2_t vsum37 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc37, vcoeffx2 );
1065+
1066+
int32x4_t vsum0426 = vcombine_s32( vmovn_s64( vsum04 ), vmovn_s64( vsum26 ) );
1067+
int32x4_t vsum1537 = vcombine_s32( vmovn_s64( vsum15 ), vmovn_s64( vsum37 ) );
1068+
1069+
int32x4_t vsuma = vtrn1q_s32( vsum0426, vsum1537 );
1070+
int32x4_t vsumb = vtrn2q_s32( vsum0426, vsum1537 );
1071+
1072+
vsuma = vaddq_s32( vsuma, voffset );
1073+
vsumb = vaddq_s32( vsumb, voffset );
1074+
1075+
vsuma = vshlq_s32( vsuma, invshift );
1076+
vsumb = vshlq_s32( vsumb, invshift );
1077+
1078+
int16x8_t vsum = pack_sum_s32_to_s16x8<isLast>( vsuma, vsumb, vibdimax );
1079+
1080+
vst1q_s16( &dst[col], vsum );
1081+
}
1082+
if( col != width ) // Last four samples.
1083+
{
1084+
int16x4_t vsrc0 = vld1_s16( &src[col + 0] );
1085+
int16x4_t vsrc1 = vld1_s16( &src[col + 1] );
1086+
int16x4_t vsrc2 = vld1_s16( &src[col + 2] );
1087+
int16x4_t vsrc3 = vld1_s16( &src[col + 3] );
1088+
1089+
int16x8_t vsrc01 = vcombine_s16( vsrc0, vsrc1 );
1090+
int16x8_t vsrc23 = vcombine_s16( vsrc2, vsrc3 );
1091+
int64x2_t vsum01 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc01, vcoeffx2 );
1092+
int64x2_t vsum23 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc23, vcoeffx2 );
1093+
1094+
int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vmovn_s64( vsum23 ) );
1095+
vsuma = vaddq_s32( vsuma, voffset );
1096+
vsuma = vshlq_s32( vsuma, invshift );
1097+
1098+
int16x4_t vsum = pack_sum_s32_to_s16x4<isLast>( vsuma, vget_low_s16( vibdimax ) );
1099+
1100+
vst1_s16( &dst[col], vsum );
1101+
}
1102+
1103+
src += srcStride;
1104+
dst += dstStride;
1105+
} while( --height != 0 );
1106+
}
1107+
1108+
template<bool isLast>
1109+
void simdInterpolateHorM2_N4_sve( const int16_t* src, int srcStride, int16_t* dst, int dstStride, int width, int height,
1110+
int shift, int offset, const ClpRng& clpRng, int16_t const* coeff )
1111+
{
1112+
CHECKD( width != 2, "Width must be two!" );
1113+
1114+
const int16x4_t vibdimax = vdup_n_s16( clpRng.max() );
1115+
const int32x4_t invshift = vdupq_n_s32( -shift );
1116+
1117+
const int16x8_t vcoeffx2 = vreinterpretq_s16_u64( vld1q_dup_u64( ( const uint64_t* )coeff ) );
1118+
const int32x4_t voffset = vdupq_n_s32( offset );
1119+
1120+
int row = 0;
1121+
for( ; row + 2 <= height; row += 2 )
1122+
{
1123+
int16x4_t vsrc0 = vld1_s16( &src[0] );
1124+
int16x4_t vsrc1 = vld1_s16( &src[1] );
1125+
int16x4_t vsrc2 = vld1_s16( &src[srcStride + 0] );
1126+
int16x4_t vsrc3 = vld1_s16( &src[srcStride + 1] );
1127+
1128+
int16x8_t vsrc01 = vcombine_s16( vsrc0, vsrc1 );
1129+
int16x8_t vsrc23 = vcombine_s16( vsrc2, vsrc3 );
1130+
int64x2_t vsum01 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc01, vcoeffx2 );
1131+
int64x2_t vsum23 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc23, vcoeffx2 );
1132+
1133+
int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vmovn_s64( vsum23 ) );
1134+
vsuma = vaddq_s32( vsuma, voffset );
1135+
vsuma = vshlq_s32( vsuma, invshift );
1136+
1137+
int16x4_t vsum = pack_sum_s32_to_s16x4<isLast>( vsuma, vibdimax );
1138+
1139+
store_s16x2x2( &dst[0], vsum, dstStride );
1140+
1141+
src += 2 * srcStride;
1142+
dst += 2 * dstStride;
1143+
}
1144+
if( row != height )
1145+
{
1146+
int16x4_t vsrc0 = vld1_s16( &src[0] );
1147+
int16x4_t vsrc1 = vld1_s16( &src[1] );
1148+
1149+
int16x8_t vsrc01 = vcombine_s16( vsrc0, vsrc1 );
1150+
int64x2_t vsum01 = vvenc_sdotq_s16( vdupq_n_s64( 0 ), vsrc01, vcoeffx2 );
1151+
1152+
int32x4_t vsuma = vcombine_s32( vmovn_s64( vsum01 ), vdup_n_s32( 0 ) );
1153+
vsuma = vaddq_s32( vsuma, voffset );
1154+
vsuma = vshlq_s32( vsuma, invshift );
1155+
1156+
int16x4_t vsum = pack_sum_s32_to_s16x4<isLast>( vsuma, vibdimax );
1157+
1158+
store_s16x2( &dst[0], vsum );
1159+
}
1160+
}
1161+
1162+
template<int N, bool isFirst, bool isLast>
1163+
void simdFilterHor_sve( const ClpRng& clpRng, Pel const* src, int srcStride, Pel* dst, int dstStride, int width,
1164+
int height, TFilterCoeff const* coeff )
1165+
{
1166+
static_assert( N == 4 || N == 6 || N == 8, "Supported taps: 4/6/8" );
1167+
CHECKD( height < 1, "Height must be >= 1!" );
1168+
CHECKD( width < 1, "Width must be >= 1!" );
1169+
CHECKD( clpRng.bd > 10, "VVenC does not support bitdepths larger than 10!" );
1170+
CHECKD( IF_INTERNAL_PREC - clpRng.bd < 2, "Bit depth headroom must be at least 2" );
1171+
1172+
const int16_t* c = coeff;
1173+
1174+
const int cStride = 1; // Horizontal mode.
1175+
src -= ( N / 2 - 1 ) * cStride;
1176+
1177+
int offset;
1178+
int headRoom = IF_INTERNAL_PREC - clpRng.bd;
1179+
int shift = IF_FILTER_PREC;
1180+
1181+
// Set shift and offset for N != 2 case.
1182+
if( isLast )
1183+
{
1184+
shift += isFirst ? 0 : headRoom;
1185+
offset = 1 << ( shift - 1 );
1186+
offset += isFirst ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC;
1187+
}
1188+
else
1189+
{
1190+
shift -= isFirst ? headRoom : 0;
1191+
offset = isFirst ? -IF_INTERNAL_OFFS * ( 1 << shift ) : 0;
1192+
}
1193+
1194+
if( N == 6 )
1195+
{
1196+
CHECKD( width % 4 != 0 && width != 1, "N6 width must be 1 or multiple of 4! width=" << width );
1197+
CHECKD( coeff[0] != 0 || coeff[7] != 0, "0th and 7th coeff must be zero for 6-tap!" );
1198+
1199+
if( width % 4 == 0 )
1200+
{
1201+
src -= 1; // Use 8-tap filter, but offset src by -1 since coeff[0] is always zero.
1202+
simdInterpolateHor_N8_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
1203+
}
1204+
else // width == 1
1205+
{
1206+
c = coeff + 1;
1207+
goto scalar_hor_m1;
1208+
}
1209+
1210+
return;
1211+
}
1212+
1213+
if( N == 8 )
1214+
{
1215+
CHECKD( width % 4 != 0 && width != 1, "N8 width must be 1 or multiple of 4! width=" << width );
1216+
1217+
if( width % 4 == 0 )
1218+
{
1219+
simdInterpolateHor_N8_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
1220+
}
1221+
else // width == 1
1222+
{
1223+
goto scalar_hor_m1;
1224+
}
1225+
1226+
return;
1227+
}
1228+
1229+
if( N == 4 )
1230+
{
1231+
CHECKD( width % 4 != 0 && width != 2 && width != 1, "N4 width must be 1 or 2 or multiple of 4! width=" << width );
1232+
1233+
if( width % 4 == 0 )
1234+
{
1235+
simdInterpolateHor_N4_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
1236+
}
1237+
else if( width == 2 )
1238+
{
1239+
simdInterpolateHorM2_N4_sve<isLast>( src, srcStride, dst, dstStride, width, height, shift, offset, clpRng, c );
1240+
}
1241+
else // width == 1
1242+
{
1243+
goto scalar_hor_m1;
1244+
}
1245+
1246+
return;
1247+
}
1248+
1249+
scalar_hor_m1: // Use scalar code for width == 1.
1250+
CHECKD( width != 1, "Width must be 1!" );
1251+
do
1252+
{
1253+
int sum = src[0] * c[0];
1254+
sum += src[1] * c[1];
1255+
sum += src[2] * c[2];
1256+
sum += src[3] * c[3];
1257+
if( N >= 6 )
1258+
{
1259+
sum += src[4] * c[4];
1260+
sum += src[5] * c[5];
1261+
}
1262+
if( N == 8 )
1263+
{
1264+
sum += src[6] * c[6];
1265+
sum += src[7] * c[7];
1266+
}
1267+
1268+
Pel val = ( sum + offset ) >> shift;
1269+
if( isLast )
1270+
{
1271+
val = ClipPel( val, clpRng );
1272+
}
1273+
dst[0] = val;
1274+
1275+
src += srcStride;
1276+
dst += dstStride;
1277+
} while( --height != 0 );
1278+
}
1279+
9541280
template<>
9551281
void InterpolationFilter::_initInterpolationFilterARM<SVE>()
9561282
{
@@ -968,6 +1294,22 @@ void InterpolationFilter::_initInterpolationFilterARM<SVE>()
9681294
m_filter16xH[0][1] = simdFilter16xH_N8_sve<true>;
9691295
m_filter16xH[1][0] = simdFilter16xH_N4_sve<false>;
9701296
m_filter16xH[1][1] = simdFilter16xH_N4_sve<true>;
1297+
1298+
// SVE is only beneficial for the horizontal filtering of the 4-tap, 6-tap, and 8-tap filters.
1299+
m_filterHor[0][0][0] = simdFilterHor_sve<8, false, false>;
1300+
m_filterHor[0][0][1] = simdFilterHor_sve<8, false, true>;
1301+
m_filterHor[0][1][0] = simdFilterHor_sve<8, true, false>;
1302+
m_filterHor[0][1][1] = simdFilterHor_sve<8, true, true>;
1303+
1304+
m_filterHor[1][0][0] = simdFilterHor_sve<4, false, false>;
1305+
m_filterHor[1][0][1] = simdFilterHor_sve<4, false, true>;
1306+
m_filterHor[1][1][0] = simdFilterHor_sve<4, true, false>;
1307+
m_filterHor[1][1][1] = simdFilterHor_sve<4, true, true>;
1308+
1309+
m_filterHor[3][0][0] = simdFilterHor_sve<6, false, false>;
1310+
m_filterHor[3][0][1] = simdFilterHor_sve<6, false, true>;
1311+
m_filterHor[3][1][0] = simdFilterHor_sve<6, true, false>;
1312+
m_filterHor[3][1][1] = simdFilterHor_sve<6, true, true>;
9711313
}
9721314

9731315
} // namespace vvenc

0 commit comments

Comments
 (0)