Skip to content

Commit 716c59d

Browse files
authored
Merge pull request #618 from micro-arm/interpolate-filter-neon
Arm: Add missing Neon implementations of simdFilter kernels
2 parents 9c70a1d + 430a995 commit 716c59d

File tree

3 files changed

+768
-237
lines changed

3 files changed

+768
-237
lines changed

source/Lib/CommonLib/arm/InterpolationFilter_neon.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,38 @@ POSSIBILITY OF SUCH DAMAGE.
5858
namespace vvenc
5959
{
6060

61+
template<bool isLast>
62+
static inline int16x4_t pack_sum_s32_to_s16x4( int32x4_t vsuma, int16x4_t vibdimax )
63+
{
64+
if( isLast )
65+
{
66+
// Narrow to u16 with saturation (clamp negatives to 0), then clamp at vibdimax.
67+
uint16x4_t usum = vqmovun_s32( vsuma );
68+
return vmin_s16( vibdimax, vreinterpret_s16_u16( usum ) );
69+
}
70+
else
71+
{
72+
// Narrow to s16 with saturation.
73+
return vqmovn_s32( vsuma );
74+
}
75+
}
76+
77+
template<bool isLast>
78+
static inline int16x8_t pack_sum_s32_to_s16x8( int32x4_t vsuma, int32x4_t vsumb, int16x8_t vibdimax )
79+
{
80+
if( isLast )
81+
{
82+
// Narrow to u16 with saturation (clamp negatives to 0), then clamp at vibdimax.
83+
uint16x8_t usum = vcombine_u16( vqmovun_s32( vsuma ), vqmovun_s32( vsumb ) );
84+
return vminq_s16( vibdimax, vreinterpretq_s16_u16( usum ) );
85+
}
86+
else
87+
{
88+
// Narrow to s16 with saturation.
89+
return vcombine_s16( vqmovn_s32( vsuma ), vqmovn_s32( vsumb ) );
90+
}
91+
}
92+
6193
static inline int32x4_t filter_vert_4x1_N6_neon( int16x4_t const vsrc[6], int16x8_t cv, int32x4_t voffset2 )
6294
{
6395
// For 6-tap, the 0th and 7th cv coefficients are zeros so remove them.

0 commit comments

Comments
 (0)