@@ -154,7 +154,10 @@ void gradFilterCore(const Pel* pSrc, int srcStride, int width, int height, int g
154154 }
155155}
156156
157- void calcBDOFSumsCore (const Pel* srcY0Tmp, const Pel* srcY1Tmp, Pel* gradX0, Pel* gradX1, Pel* gradY0, Pel* gradY1, int xu, int yu, const int src0Stride, const int src1Stride, const int widthG, const int bitDepth, int * sumAbsGX, int * sumAbsGY, int * sumDIX, int * sumDIY, int * sumSignGY_GX)
157+ void calcBDOFSumsCore ( const Pel* srcY0Tmp, const Pel* srcY1Tmp, const Pel* gradX0, const Pel* gradX1,
158+ const Pel* gradY0, const Pel* gradY1, int xu, int yu, const int src0Stride, const int src1Stride,
159+ const int widthG, const int bitDepth, int * sumAbsGX, int * sumAbsGY, int * sumDIX, int * sumDIY,
160+ int * sumSignGY_GX )
158161{
159162 int shift4 = 4 ;
160163 int shift5 = 1 ;
@@ -596,6 +599,68 @@ void InterPrediction::xSubPuMC(CodingUnit& cu, PelUnitBuf& predBuf, const RefPic
596599 cu.affine = isAffine;
597600}
598601
// Shifts numer right by floorLog2( denom ) bits and returns the result.
// NOTE(review): floorLog2 is defined outside this view; presumably it yields the index of the
// most significant set bit of denom, which would make this a cheap stand-in for numer / denom — confirm.
// Both call sites visible in this patch only call it when denom is non-zero.
602+ static inline int xRightShiftMSB ( int numer, int denom )
603+ {
604+ return numer >> floorLog2 ( denom );
605+ }
606+
// Walks a width x height block in 4x4 units. For every unit it collects five sums through
// calcBDOFSumsCore, turns them into two clipped offsets (tmpx, tmpy), and hands those together
// with the unit's source, gradient and destination pointers to addBDOFAvgCore.
//   srcY0, srcY1            : source buffers, addressed with a row stride of width + 2 * BDOF_EXTEND_SIZE + 2
//   gradX0/1, gradY0/1      : gradient buffers (by name), addressed with a row stride of width + 2 * BDOF_EXTEND_SIZE
//   width, height           : block size; the unit counts are width >> 2 and height >> 2
//   dstY, dstStride         : destination buffer and its row stride
//   shiftNum, offset, clpRng: forwarded unchanged to addBDOFAvgCore
//   limit                   : tmpx and tmpy are clipped to [-limit, limit]
//   bitDepth                : forwarded unchanged to calcBDOFSumsCore
// NOTE(review): presumably width and height are multiples of 4; any remainder is dropped by the >> 2 — confirm with callers.
607+ void xFpBiDirOptFlowCore ( const Pel* srcY0, const Pel* srcY1, const Pel* gradX0, const Pel* gradX1, const Pel* gradY0,
608+ const Pel* gradY1, const int width, const int height, Pel* dstY, const ptrdiff_t dstStride,
609+ const int shiftNum, const int offset, const int limit, const ClpRng& clpRng,
610+ const int bitDepth )
611+ {
// Number of 4x4 units horizontally and vertically.
612+ int xUnit = width >> 2 ;
613+ int yUnit = height >> 2 ;
// NOTE(review): heightG is computed here but never read anywhere else in this function.
614+ int heightG = height + 2 * BDOF_EXTEND_SIZE;
// Row stride used for all four gradient buffers.
615+ int widthG = width + 2 * BDOF_EXTEND_SIZE;
616+
// Gradient offset that skips BDOF_EXTEND_SIZE rows and BDOF_EXTEND_SIZE columns.
617+ int offsetPos = widthG * BDOF_EXTEND_SIZE + BDOF_EXTEND_SIZE;
// Row stride of the source buffers: two samples wider than the gradient rows.
618+ int stridePredMC = widthG + 2 ;
619+
// Both source buffers share the same stride.
620+ const int src0Stride = stridePredMC;
621+ const int src1Stride = stridePredMC;
622+
// NOTE(review): these initial values are never used; both pointers are reassigned inside the loop before being read.
623+ const Pel* srcY0Temp = srcY0;
624+ const Pel* srcY1Temp = srcY1;
625+
626+ for ( int yu = 0 ; yu < yUnit; yu++ )
627+ {
628+ for ( int xu = 0 ; xu < xUnit; xu++ )
629+ {
630+ int tmpx = 0 , tmpy = 0 ;
631+ int sumAbsGX = 0 , sumAbsGY = 0 , sumDIX = 0 , sumDIY = 0 ;
632+ int sumSignGY_GX = 0 ;
633+
// Pointers for the sum computation: unit origin at ( 4 * xu, 4 * yu ) with no extra border offset.
634+ const Pel* pGradX0Tmp = gradX0 + ( xu << 2 ) + ( yu << 2 ) * widthG;
635+ const Pel* pGradX1Tmp = gradX1 + ( xu << 2 ) + ( yu << 2 ) * widthG;
636+ const Pel* pGradY0Tmp = gradY0 + ( xu << 2 ) + ( yu << 2 ) * widthG;
637+ const Pel* pGradY1Tmp = gradY1 + ( xu << 2 ) + ( yu << 2 ) * widthG;
638+ const Pel* SrcY1Tmp = srcY1 + ( xu << 2 ) + ( yu << 2 ) * src1Stride;
639+ const Pel* SrcY0Tmp = srcY0 + ( xu << 2 ) + ( yu << 2 ) * src0Stride;
640+
// The five sums are returned through the pointer arguments.
641+ calcBDOFSumsCore ( SrcY0Tmp, SrcY1Tmp, pGradX0Tmp, pGradX1Tmp, pGradY0Tmp, pGradY1Tmp, xu, yu, src0Stride,
642+ src1Stride, widthG, bitDepth, &sumAbsGX, &sumAbsGY, &sumDIX, &sumDIY, &sumSignGY_GX );
// tmpx is 0 when sumAbsGX is 0 (this also keeps a zero denom away from xRightShiftMSB), then clipped.
643+ tmpx = ( sumAbsGX == 0 ? 0 : xRightShiftMSB ( 4 * sumDIX, sumAbsGX ) );
644+ tmpx = Clip3 ( -limit, limit, tmpx );
645+
// Operator precedence: this evaluates as ( sumSignGY_GX * tmpx ) >> 1.
646+ const int tmpData = sumSignGY_GX * tmpx >> 1 ;
// tmpy follows the same pattern as tmpx, with tmpData subtracted from the numerator.
647+ tmpy = ( sumAbsGY == 0 ? 0 : xRightShiftMSB ( ( 4 * sumDIY - tmpData ), sumAbsGY ) );
648+ tmpy = Clip3 ( -limit, limit, tmpy );
649+
// Pointers for the final call: sources are moved one row and one column in, gradients by offsetPos.
// srcY1Temp is offset with src0Stride; src0Stride and src1Stride are both stridePredMC, so the result is the same.
650+ srcY0Temp = srcY0 + ( stridePredMC + 1 ) + ( ( yu * src0Stride + xu ) << 2 );
651+ srcY1Temp = srcY1 + ( stridePredMC + 1 ) + ( ( yu * src0Stride + xu ) << 2 );
652+ pGradX0Tmp = gradX0 + offsetPos + ( ( yu * widthG + xu ) << 2 );
653+ pGradX1Tmp = gradX1 + offsetPos + ( ( yu * widthG + xu ) << 2 );
654+ pGradY0Tmp = gradY0 + offsetPos + ( ( yu * widthG + xu ) << 2 );
655+ pGradY1Tmp = gradY1 + offsetPos + ( ( yu * widthG + xu ) << 2 );
656+
// Destination of this unit; ( 1 << 2 ) passes 4 for both block dimensions.
657+ Pel* dstY0 = dstY + ( ( yu * dstStride + xu ) << 2 );
658+ addBDOFAvgCore ( srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, pGradX0Tmp, pGradX1Tmp,
659+ pGradY0Tmp, pGradY1Tmp, widthG, ( 1 << 2 ), ( 1 << 2 ), tmpx, tmpy, shiftNum, offset, clpRng );
660+ } // xu
661+ } // yu
662+ }
663+
599664InterPredInterpolation::InterPredInterpolation ()
600665 : m_storedMv(nullptr )
601666 , m_skipPROF(false )
@@ -639,7 +704,7 @@ void InterPredInterpolation::destroy()
639704 }
640705}
641706
642- void InterPredInterpolation::init ()
707+ void InterPredInterpolation::init ( bool enableOpt )
643708{
644709 for ( uint32_t c = 0 ; c < MAX_NUM_COMP; c++ )
645710 {
@@ -672,18 +737,21 @@ void InterPredInterpolation::init()
672737
673738 m_if.initInterpolationFilter ( true );
674739
740+ xFpBiDirOptFlow = xFpBiDirOptFlowCore;
675741 xFpBDOFGradFilter = gradFilterCore;
676742 xFpProfGradFilter = gradFilterCore<false >;
677743 xFpApplyPROF = applyPROFCore;
678744 xFpPadDmvr = padDmvrCore;
679745
746+ if ( enableOpt )
747+ {
680748#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_X86 )
681- initInterPredictionX86 ();
749+ initInterPredictionX86 ();
682750#endif
683-
684751#if ENABLE_SIMD_OPT_BDOF && defined( TARGET_SIMD_ARM )
685- initInterPredictionARM ();
752+ initInterPredictionARM ();
686753#endif
754+ }
687755
688756 if (m_storedMv == nullptr )
689757 {
@@ -841,11 +909,6 @@ void InterPredInterpolation::xPredInterBlk( const ComponentID compID, const Codi
841909 }
842910}
843911
844- int InterPredInterpolation::xRightShiftMSB ( int numer, int denom )
845- {
846- return ( numer >> floorLog2 ( denom ) );
847- }
848-
849912void InterPredInterpolation::xApplyBDOF ( PelBuf& yuvDst, const ClpRng& clpRng )
850913{
851914 const int bitDepth = clpRng.bd ;
@@ -868,9 +931,7 @@ void InterPredInterpolation::xApplyBDOF( PelBuf& yuvDst, const ClpRng& clpRng )
868931 const int src1Stride = stridePredMC;
869932
870933 Pel* dstY = yuvDst.buf ;
871- const int dstStride = yuvDst.stride ;
872- const Pel* srcY0Temp = srcY0;
873- const Pel* srcY1Temp = srcY1;
934+ const int dstStride = yuvDst.stride ;
874935
875936 for (int refList = 0 ; refList < NUM_REF_PIC_LIST_01; refList++)
876937 {
@@ -896,53 +957,8 @@ void InterPredInterpolation::xApplyBDOF( PelBuf& yuvDst, const ClpRng& clpRng )
896957 const int offset = (1 << (shiftNum - 1 )) + 2 * IF_INTERNAL_OFFS;
897958 const int limit = (1 << 4 ) - 1 ;
898959
899- if ( xFpBiDirOptFlow )
900- {
901- xFpBiDirOptFlow ( srcY0, srcY1, gradX0, gradX1, gradY0, gradY1, width, height, dstY, dstStride, shiftNum, offset, limit, clpRng, bitDepth );
902- return ;
903- }
904-
905- int xUnit = (width >> 2 );
906- int yUnit = (height >> 2 );
907-
908- Pel* dstY0 = dstY;
909- gradX0 = m_gradX0; gradX1 = m_gradX1;
910- gradY0 = m_gradY0; gradY1 = m_gradY1;
911-
912- for (int yu = 0 ; yu < yUnit; yu++)
913- {
914- for (int xu = 0 ; xu < xUnit; xu++)
915- {
916- int tmpx = 0 , tmpy = 0 ;
917- int sumAbsGX = 0 , sumAbsGY = 0 , sumDIX = 0 , sumDIY = 0 ;
918- int sumSignGY_GX = 0 ;
919-
920- Pel* pGradX0Tmp = m_gradX0 + (xu << 2 ) + (yu << 2 ) * widthG;
921- Pel* pGradX1Tmp = m_gradX1 + (xu << 2 ) + (yu << 2 ) * widthG;
922- Pel* pGradY0Tmp = m_gradY0 + (xu << 2 ) + (yu << 2 ) * widthG;
923- Pel* pGradY1Tmp = m_gradY1 + (xu << 2 ) + (yu << 2 ) * widthG;
924- const Pel* SrcY1Tmp = srcY1 + (xu << 2 ) + (yu << 2 ) * src1Stride;
925- const Pel* SrcY0Tmp = srcY0 + (xu << 2 ) + (yu << 2 ) * src0Stride;
926-
927- calcBDOFSumsCore (SrcY0Tmp, SrcY1Tmp, pGradX0Tmp, pGradX1Tmp, pGradY0Tmp, pGradY1Tmp, xu, yu, src0Stride, src1Stride, widthG, bitDepth, &sumAbsGX, &sumAbsGY, &sumDIX, &sumDIY, &sumSignGY_GX);
928- tmpx = (sumAbsGX == 0 ? 0 : xRightShiftMSB (4 * sumDIX, sumAbsGX));
929- tmpx = Clip3 (-limit, limit, tmpx);
930-
931- const int tmpData = sumSignGY_GX * tmpx >> 1 ;
932- tmpy = (sumAbsGY == 0 ? 0 : xRightShiftMSB ((4 * sumDIY - tmpData), sumAbsGY));
933- tmpy = Clip3 (-limit, limit, tmpy);
934-
935- srcY0Temp = srcY0 + (stridePredMC + 1 ) + ((yu*src0Stride + xu) << 2 );
936- srcY1Temp = srcY1 + (stridePredMC + 1 ) + ((yu*src0Stride + xu) << 2 );
937- gradX0 = m_gradX0 + offsetPos + ((yu*widthG + xu) << 2 );
938- gradX1 = m_gradX1 + offsetPos + ((yu*widthG + xu) << 2 );
939- gradY0 = m_gradY0 + offsetPos + ((yu*widthG + xu) << 2 );
940- gradY1 = m_gradY1 + offsetPos + ((yu*widthG + xu) << 2 );
941-
942- dstY0 = dstY + ((yu*dstStride + xu) << 2 );
943- addBDOFAvgCore (srcY0Temp, src0Stride, srcY1Temp, src1Stride, dstY0, dstStride, gradX0, gradX1, gradY0, gradY1, widthG, (1 << 2 ), (1 << 2 ), tmpx, tmpy, shiftNum, offset, clpRng);
944- } // xu
945- } // yu
960+ xFpBiDirOptFlow ( srcY0, srcY1, gradX0, gradX1, gradY0, gradY1, width, height, dstY, dstStride, shiftNum, offset,
961+ limit, clpRng, bitDepth );
946962}
947963
948964void InterPredInterpolation::xWeightedAverage ( const CodingUnit& cu, const CPelUnitBuf& pcYuvSrc0, const CPelUnitBuf& pcYuvSrc1, PelUnitBuf& pcYuvDst, const bool bdofApplied, PelUnitBuf *yuvPredTmp )
0 commit comments