@@ -154,7 +154,10 @@ void gradFilterCore(const Pel* pSrc, int srcStride, int width, int height, int g
154154 }
155155}
156156
157- void calcBDOFSumsCore (const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel* gradX1, Pel* gradY0, Pel* gradY1, int xu, int yu, const int src0Stride, const int src1Stride, const int widthG, const int bitDepth, int * sumAbsGX, int * sumAbsGY, int * sumDIX, int * sumDIY, int * sumSignGY_GX)
157+ void calcBDOFSumsCore ( const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1,
158+ const Pel* gradY0, const Pel* gradY1, int xu, int yu, const int src0Stride, const int src1Stride,
159+ const int widthG, const int bitDepth, int * sumAbsGX, int * sumAbsGY, int * sumDIX, int * sumDIY,
160+ int * sumSignGY_GX )
158161{
159162 int shift4 = 4 ;
160163 int shift5 = 1 ;
@@ -596,6 +599,67 @@ void InterPrediction::xSubPuMC(CodingUnit& cu, PelUnitBuf& predBuf, const RefPic
596599 cu.affine = isAffine;
597600}
598601
// File-local helper: shifts `numer` right by floorLog2( denom ) bits.
// NOTE(review): floorLog2 is defined outside this view; presumably it returns the
// index of the most significant set bit of `denom`, which would make this a cheap
// stand-in for a division by `denom` -- confirm against its definition.
// Both call sites in xFpBiDirOptFlowCore test the denominator against zero before calling.
602+ static inline int xRightShiftMSB ( int numer, int denom )
603+ {
604+ return numer >> floorLog2 ( denom );
605+ }
606+
// Plain C++ implementation that InterPredInterpolation::init installs as the default
// xFpBiDirOptFlow function pointer.
//
// Walks the block in 4x4 units ( width >> 2 by height >> 2 ). For every unit it
//   1. gathers the five sums via calcBDOFSumsCore,
//   2. derives the two per-unit values tmpx / tmpy, each clipped to [-limit, limit],
//   3. calls addBDOFAvgCore to write the 4x4 result into dstY.
//
// Parameters (as used by the code below):
//   srcY0, srcY1     - the two source sample buffers; rows are addressed with stride widthG + 2
//   gradX0 .. gradY1 - the four gradient buffers; rows are addressed with stride widthG
//   width, height    - block size; only whole 4x4 units are processed
//   dstY, dstStride  - destination buffer and its row stride
//   shiftNum, offset - forwarded unchanged to addBDOFAvgCore
//   limit            - symmetric clipping bound applied to tmpx and tmpy
//   clpRng           - forwarded unchanged to addBDOFAvgCore
//   bitDepth         - forwarded unchanged to calcBDOFSumsCore
607+ void xFpBiDirOptFlowCore ( const Pel* srcY0, const Pel* srcY1, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0,
608+ const Pel* gradY1, const int width, const int height, Pel* dstY, const ptrdiff_t dstStride,
609+ const int shiftNum, const int offset, const int limit, const ClpRng& clpRng,
610+ const int bitDepth )
611+ {
// Number of 4x4 units horizontally / vertically.
612+ int xUnit = width >> 2 ;
613+ int yUnit = height >> 2 ;
// Row stride of the gradient buffers: the block width plus BDOF_EXTEND_SIZE on each side.
614+ int widthG = width + 2 * BDOF_EXTEND_SIZE;
615+
// Element offset that skips BDOF_EXTEND_SIZE rows and BDOF_EXTEND_SIZE columns of a gradient buffer.
616+ int offsetPos = widthG * BDOF_EXTEND_SIZE + BDOF_EXTEND_SIZE;
// Row stride of the two source buffers.
617+ int stridePredMC = widthG + 2 ;
618+
// Both source buffers share the same stride.
619+ const int src0Stride = stridePredMC;
620+ const int src1Stride = stridePredMC;
621+
// These initial values are never read; both pointers are reassigned inside the loop before use.
622+ const Pel* srcY0Temp = srcY0;
623+ const Pel* srcY1Temp = srcY1;
624+
625+ for ( int yu = 0 ; yu < yUnit; yu++ )
626+ {
627+ for ( int xu = 0 ; xu < xUnit; xu++ )
628+ {
629+ int tmpx = 0 , tmpy = 0 ;
630+ int sumAbsGX = 0 , sumAbsGY = 0 , sumDIX = 0 , sumDIY = 0 ;
631+ int sumSignGY_GX = 0 ;
632+
// Pointers for the sum computation: origin of the current unit, without the
// offsetPos / ( stridePredMC + 1 ) offsets that are applied further down.
633+ const Pel* pGradX0Tmp = gradX0 + ( xu << 2 ) + ( yu << 2 ) * widthG;
634+ const Pel* pGradX1Tmp = gradX1 + ( xu << 2 ) + ( yu << 2 ) * widthG;
635+ const Pel* pGradY0Tmp = gradY0 + ( xu << 2 ) + ( yu << 2 ) * widthG;
636+ const Pel* pGradY1Tmp = gradY1 + ( xu << 2 ) + ( yu << 2 ) * widthG;
637+ const Pel* SrcY1Tmp = srcY1 + ( xu << 2 ) + ( yu << 2 ) * src1Stride;
638+ const Pel* SrcY0Tmp = srcY0 + ( xu << 2 ) + ( yu << 2 ) * src0Stride;
639+
640+ calcBDOFSumsCore ( SrcY0Tmp, SrcY1Tmp, pGradX0Tmp, pGradX1Tmp, pGradY0Tmp, pGradY1Tmp, xu, yu, src0Stride,
641+ src1Stride, widthG, bitDepth, &sumAbsGX, &sumAbsGY, &sumDIX, &sumDIY, &sumSignGY_GX );
// tmpx: 4 * sumDIX shifted by the MSB position of sumAbsGX; forced to 0 when sumAbsGX is 0.
642+ tmpx = ( sumAbsGX == 0 ? 0 : xRightShiftMSB ( 4 * sumDIX, sumAbsGX ) );
643+ tmpx = Clip3 ( -limit, limit, tmpx );
644+
// tmpy additionally subtracts a cross term built from the already clipped tmpx.
// Precedence: the product is formed first, then shifted right by one.
645+ const int tmpData = sumSignGY_GX * tmpx >> 1 ;
646+ tmpy = ( sumAbsGY == 0 ? 0 : xRightShiftMSB ( ( 4 * sumDIY - tmpData ), sumAbsGY ) );
647+ tmpy = Clip3 ( -limit, limit, tmpy );
648+
// Pointers for the averaging step: sources advance by one row and one column
// ( stridePredMC + 1 ), gradients by offsetPos, before the per-unit displacement.
// The srcY1 pointer is computed with src0Stride; this is equivalent because
// src0Stride and src1Stride are both set to stridePredMC above.
649+ srcY0Temp = srcY0 + ( stridePredMC + 1 ) + ( ( yu * src0Stride + xu ) << 2 );
650+ srcY1Temp = srcY1 + ( stridePredMC + 1 ) + ( ( yu * src0Stride + xu ) << 2 );
651+ pGradX0Tmp = gradX0 + offsetPos + ( ( yu * widthG + xu ) << 2 );
652+ pGradX1Tmp = gradX1 + offsetPos + ( ( yu * widthG + xu ) << 2 );
653+ pGradY0Tmp = gradY0 + offsetPos + ( ( yu * widthG + xu ) << 2 );
654+ pGradY1Tmp = gradY1 + offsetPos + ( ( yu * widthG + xu ) << 2 );
655+
// Top-left destination sample of the current 4x4 unit.
656+ Pel* dstY0 = dstY + ( ( yu * dstStride + xu ) << 2 );
// The two ( 1 << 2 ) arguments pass the value 4 twice, matching the 4x4 unit size.
657+ addBDOFAvgCore ( srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, pGradX0Tmp, pGradX1Tmp,
658+ pGradY0Tmp, pGradY1Tmp, widthG, ( 1 << 2 ), ( 1 << 2 ), tmpx, tmpy, shiftNum, offset, clpRng );
659+ } // xu
660+ } // yu
661+ }
662+
599663InterPredInterpolation::InterPredInterpolation ()
600664 : m_storedMv(nullptr )
601665 , m_skipPROF(false )
@@ -639,7 +703,7 @@ void InterPredInterpolation::destroy()
639703 }
640704}
641705
642- void InterPredInterpolation::init ()
706+ void InterPredInterpolation::init ( bool enableOpt )
643707{
644708 for ( uint32_t c = 0 ; c < MAX_NUM_COMP; c++ )
645709 {
@@ -672,18 +736,21 @@ void InterPredInterpolation::init()
672736
673737 m_if.initInterpolationFilter ( true );
674738
739+ xFpBiDirOptFlow = xFpBiDirOptFlowCore;
675740 xFpBDOFGradFilter = gradFilterCore;
676741 xFpProfGradFilter = gradFilterCore<false >;
677742 xFpApplyPROF = applyPROFCore;
678743 xFpPadDmvr = padDmvrCore;
679744
745+ if ( enableOpt )
746+ {
680747#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_X86 )
681- initInterPredictionX86 ();
748+ initInterPredictionX86 ();
682749#endif
683-
684750#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_ARM )
685- initInterPredictionARM ();
751+ initInterPredictionARM ();
686752#endif
753+ }
687754
688755 if (m_storedMv == nullptr )
689756 {
@@ -841,11 +908,6 @@ void InterPredInterpolation::xPredInterBlk( const ComponentID compID, const Codi
841908 }
842909}
843910
844- int InterPredInterpolation::xRightShiftMSB ( int numer, int denom )
845- {
846- return ( numer >> floorLog2 ( denom ) );
847- }
848-
849911void InterPredInterpolation::xApplyBDOF ( PelBuf& yuvDst, const ClpRng& clpRng )
850912{
851913 const int bitDepth = clpRng.bd ;
@@ -854,7 +916,6 @@ void InterPredInterpolation::xApplyBDOF( PelBuf& yuvDst, const ClpRng& clpRng )
854916 const int width = yuvDst.width ;
855917 int heightG = height + 2 * BDOF_EXTEND_SIZE;
856918 int widthG = width + 2 * BDOF_EXTEND_SIZE;
857- int offsetPos = widthG*BDOF_EXTEND_SIZE + BDOF_EXTEND_SIZE;
858919
859920 Pel* gradX0 = m_gradX0;
860921 Pel* gradX1 = m_gradX1;
@@ -864,13 +925,9 @@ void InterPredInterpolation::xApplyBDOF( PelBuf& yuvDst, const ClpRng& clpRng )
864925 int stridePredMC = widthG + 2 ;
865926 const Pel* srcY0 = m_filteredBlockTmp[2 ][COMP_Y] + stridePredMC + 1 ;
866927 const Pel* srcY1 = m_filteredBlockTmp[3 ][COMP_Y] + stridePredMC + 1 ;
867- const int src0Stride = stridePredMC;
868- const int src1Stride = stridePredMC;
869928
870929 Pel* dstY = yuvDst.buf ;
871- const int dstStride = yuvDst.stride ;
872- const Pel* srcY0Temp = srcY0;
873- const Pel* srcY1Temp = srcY1;
930+ const int dstStride = yuvDst.stride ;
874931
875932 for (int refList = 0 ; refList < NUM_REF_PIC_LIST_01; refList++)
876933 {
@@ -896,53 +953,8 @@ void InterPredInterpolation::xApplyBDOF( PelBuf& yuvDst, const ClpRng& clpRng )
896953 const int offset = (1 << (shiftNum - 1 )) + 2 * IF_INTERNAL_OFFS;
897954 const int limit = (1 << 4 ) - 1 ;
898955
899- if ( xFpBiDirOptFlow )
900- {
901- xFpBiDirOptFlow ( srcY0, srcY1, gradX0, gradX1, gradY0, gradY1, width, height, dstY, dstStride, shiftNum, offset, limit, clpRng, bitDepth );
902- return ;
903- }
904-
905- int xUnit = (width >> 2 );
906- int yUnit = (height >> 2 );
907-
908- Pel* dstY0 = dstY;
909- gradX0 = m_gradX0; gradX1 = m_gradX1;
910- gradY0 = m_gradY0; gradY1 = m_gradY1;
911-
912- for (int yu = 0 ; yu < yUnit; yu++)
913- {
914- for (int xu = 0 ; xu < xUnit; xu++)
915- {
916- int tmpx = 0 , tmpy = 0 ;
917- int sumAbsGX = 0 , sumAbsGY = 0 , sumDIX = 0 , sumDIY = 0 ;
918- int sumSignGY_GX = 0 ;
919-
920- Pel* pGradX0Tmp = m_gradX0 + (xu << 2 ) + (yu << 2 ) * widthG;
921- Pel* pGradX1Tmp = m_gradX1 + (xu << 2 ) + (yu << 2 ) * widthG;
922- Pel* pGradY0Tmp = m_gradY0 + (xu << 2 ) + (yu << 2 ) * widthG;
923- Pel* pGradY1Tmp = m_gradY1 + (xu << 2 ) + (yu << 2 ) * widthG;
924- const Pel* SrcY1Tmp = srcY1 + (xu << 2 ) + (yu << 2 ) * src1Stride;
925- const Pel* SrcY0Tmp = srcY0 + (xu << 2 ) + (yu << 2 ) * src0Stride;
926-
927- calcBDOFSumsCore (SrcY0Tmp, SrcY1Tmp, pGradX0Tmp, pGradX1Tmp, pGradY0Tmp, pGradY1Tmp, xu, yu, src0Stride, src1Stride, widthG, bitDepth, &sumAbsGX, &sumAbsGY, &sumDIX, &sumDIY, &sumSignGY_GX);
928- tmpx = (sumAbsGX == 0 ? 0 : xRightShiftMSB (4 * sumDIX, sumAbsGX));
929- tmpx = Clip3 (-limit, limit, tmpx);
930-
931- const int tmpData = sumSignGY_GX * tmpx >> 1 ;
932- tmpy = (sumAbsGY == 0 ? 0 : xRightShiftMSB ((4 * sumDIY - tmpData), sumAbsGY));
933- tmpy = Clip3 (-limit, limit, tmpy);
934-
935- srcY0Temp = srcY0 + (stridePredMC + 1 ) + ((yu*src0Stride + xu) << 2 );
936- srcY1Temp = srcY1 + (stridePredMC + 1 ) + ((yu*src0Stride + xu) << 2 );
937- gradX0 = m_gradX0 + offsetPos + ((yu*widthG + xu) << 2 );
938- gradX1 = m_gradX1 + offsetPos + ((yu*widthG + xu) << 2 );
939- gradY0 = m_gradY0 + offsetPos + ((yu*widthG + xu) << 2 );
940- gradY1 = m_gradY1 + offsetPos + ((yu*widthG + xu) << 2 );
941-
942- dstY0 = dstY + ((yu*dstStride + xu) << 2 );
943- addBDOFAvgCore (srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, gradX0, gradX1, gradY0, gradY1, widthG, (1 << 2 ), (1 << 2 ), tmpx, tmpy, shiftNum, offset, clpRng);
944- } // xu
945- } // yu
956+ xFpBiDirOptFlow ( srcY0, srcY1, gradX0, gradX1, gradY0, gradY1, width, height, dstY, dstStride, shiftNum, offset,
957+ limit, clpRng, bitDepth );
946958}
947959
948960void InterPredInterpolation::xWeightedAverage ( const CodingUnit& cu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const bool bdofApplied, PelUnitBuf *yuvPredTmp )
0 commit comments