Merge pull request fraunhoferhhi#576 from micro-arm/cpycoeff-neon

adamjw24 · web-flow · commit 5c2b2cbb5b5c · 2025-07-03T02:21:56.000+02:00
Arm: Add Neon implementations of TCoeffOps::cpyCoeff kernels
diff --git a/source/Lib/CommonLib/arm/neon/Trafo_neon.cpp b/source/Lib/CommonLib/arm/neon/Trafo_neon.cpp
@@ -66,6 +66,55 @@ static inline int64_t shift_and_round( int64_t x, int shift )
   return ( x + ( static_cast<int64_t>(1) << ( shift - 1 ) ) ) >> shift;
 }
 
+template<int W>
+void cpyCoeff_neon( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height );
+
+template<>
+void cpyCoeff_neon<4>( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height )
+{
+  CHECK( height < 1, "Height must be >= 1" );
+  CHECK( width != 4, "Width must be 4" );
+
+  do
+  {
+    int16x4_t s = vld1_s16( src );
+    vst1q_s32( dst, vmovl_s16( s ) );
+
+    src += srcStride;
+    dst += width;
+  } while( --height != 0 );
+}
+
+template<>
+void cpyCoeff_neon<8>( const Pel* src, ptrdiff_t srcStride, TCoeff* dst, unsigned width, unsigned height )
+{
+  CHECK( height < 1, "Height must be >= 1" );
+  CHECK( width < 8 || width & 7, "Width must be >= 8 and a multiple of 8" );
+
+  do
+  {
+    unsigned w = 0;
+    for( ; w + 16 <= width; w += 16 )
+    {
+      int16x8_t s_lo = vld1q_s16( src + w + 0 );
+      int16x8_t s_hi = vld1q_s16( src + w + 8 );
+      vst1q_s32( dst + w + 0, vmovl_s16( vget_low_s16( s_lo ) ) );
+      vst1q_s32( dst + w + 4, vmovl_s16( vget_high_s16( s_lo ) ) );
+      vst1q_s32( dst + w + 8, vmovl_s16( vget_low_s16( s_hi ) ) );
+      vst1q_s32( dst + w + 12, vmovl_s16( vget_high_s16( s_hi ) ) );
+    }
+    if( width & 8 )
+    {
+      int16x8_t s = vld1q_s16( src + w );
+      vst1q_s32( dst + w + 0, vmovl_s16( vget_low_s16( s ) ) );
+      vst1q_s32( dst + w + 4, vmovl_s16( vget_high_s16( s ) ) );
+    }
+
+    src += srcStride;
+    dst += width;
+  } while( --height != 0 );
+}
+
 template<unsigned trSize>
 static void fastInvCore_neon( const TMatrixCoeff* it, const TCoeff* src, TCoeff* dst, unsigned lines,
                               unsigned reducedLines, unsigned rows )
@@ -264,6 +313,9 @@ static void fastFwdCore_neon( const TMatrixCoeff* tc, const TCoeff* src, TCoeff*
 template<>
 void TCoeffOps::_initTCoeffOpsARM<NEON>()
 {
+  cpyCoeff4 = cpyCoeff_neon<4>;
+  cpyCoeff8 = cpyCoeff_neon<8>;
+
   fastInvCore[ 0 ]    = fastInvCore_neon<4>;
   fastInvCore[ 1 ]    = fastInvCore_neon<8>;
   fastInvCore[ 2 ]    = fastInvCore_neon<16>;