Skip to content

Commit 36652e4

Browse files
authored
Preparing v1.10.0-rc1: first IDR at POC31, adaptive MCTF, ~0.5% BDR gains, ... (#336)
1 parent 6d27696 commit 36652e4

File tree

17 files changed

+126
-103
lines changed

17 files changed

+126
-103
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ if( NOT CMAKE_VERSION VERSION_LESS 3.13.0 )
1010
endif()
1111

1212
# project name
13-
project( vvenc VERSION 1.9.1 )
13+
project( vvenc VERSION 1.10.0 )
1414

1515
# set alternative version numbering for release candidates
16-
#set( PROJECT_VERSION_RC rc1 )
16+
set( PROJECT_VERSION_RC rc1 )
1717
if( PROJECT_VERSION_RC )
1818
set( PROJECT_VERSION "${PROJECT_VERSION}-${PROJECT_VERSION_RC}" )
1919
endif()

cfg/experimental/lowdelay_faster.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,4 @@ ReduceIntraChromaModesFullRD : 1 # Reduce modes for chroma full RD intra s
144144
FastTTSplit : 0 # Fast method for TT split
145145
ReduceFilterME : 2 # Use reduced filter taps for subpel motion estimation (2: 4-tap, 1: 6-tap)
146146
SelectiveRDOQ : 2 # Only use RDOQ when there are non-zero unquantized coefficients (0: never, 1: always, 2: for natural content)
147-
FirstPassMode : 2 # FirstPassMode (0: default, 1: faster, 2: faster with temporal downsampling)
147+
FirstPassMode : 4 # FirstPassMode (0: default, 1: faster, 2: faster with temporal downsampling, 3: faster with resolution downsampling, 4: faster with temporal and resolution downsampling)

cfg/randomaccess_faster.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,4 +132,4 @@ ReduceIntraChromaModesFullRD : 1 # Reduce modes for chroma full RD intra s
132132
FastTTSplit : 0 # Fast method for TT split
133133
ReduceFilterME : 2 # Use reduced filter taps for subpel motion estimation (2: 4-tap, 1: 6-tap)
134134
SelectiveRDOQ : 2 # Only use RDOQ when there are non-zero unquantized coefficients (0: never, 1: always, 2: for natural content)
135-
FirstPassMode : 2 # FirstPassMode (0: default, 1: faster, 2: faster with temporal downsampling)
135+
FirstPassMode : 4 # FirstPassMode (0: default, 1: faster, 2: faster with temporal downsampling, 3: faster with resolution downsampling, 4: faster with temporal and resolution downsampling)

changelog.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,23 @@
1+
/////////////////////////////////////////
2+
tag 1.10.0-rc1
3+
* libvvenc:
4+
- added library parameters:
5+
- vvenc_config::m_poc0idr to force POC0 to be an IDR (otherwise, by default, it will now be a RASL picture)
6+
- changed default GOP structure of the first GOP to align with other GOPs, ensuring all intra-periods
7+
and DASH segments have equal length
8+
- enable spatial subsampling for the 1st pass in 2-pass RC by default for the faster preset
9+
- adaptively extend the number of neighboring frames in MCTF pre-filtering
10+
- allow ALF derivation from partial data for FPPLinesSynchro
11+
- improved rate matching accuracy at high target rates on easy-to-encode content, especially for HDR
12+
- disable compilation for SSE42 and AVX as no specific code is used,
13+
only keep explicit support for SSE41 and AVX2
14+
- around 0.5% BDR gain for all presets
15+
- minor changes and cleanups to rate control, DMVR, MCTF, QPA and others
16+
17+
* vvencFFapp:
18+
- added parameter: POC0IDR to control if POC0 is an IDR (=1, default if PicReordering is 1)
19+
or RASL (=0, default if PicReordering is 0)
20+
121
/////////////////////////////////////////
222
tag 1.9.1
323
* libvvenc:

source/Lib/CommonLib/MCTF.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ void applyPlanarCorrectionCore( const Pel* refPel, const ptrdiff_t refStride, Pe
416416
}
417417
}
418418

419-
void applyBlockCore( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const ClpRng& clpRng, const Pel** correctedPics, int numRefs, const int* verror, const double refStrenghts[4], double weightScaling, double sigmaSq )
419+
void applyBlockCore( const CPelBuf& src, PelBuf& dst, const CompArea& blk, const ClpRng& clpRng, const Pel** correctedPics, int numRefs, const int* verror, const double* refStrenghts, double weightScaling, double sigmaSq )
420420
{
421421
const int w = blk.width;
422422
const int h = blk.height;
@@ -586,7 +586,7 @@ void MCTF::init( const VVEncCfg& encCfg, bool isFinalPass, NoMallocThreadPool* t
586586
m_area = Area( 0, 0, m_encCfg->m_PadSourceWidth, m_encCfg->m_PadSourceHeight );
587587

588588
// TLayer (TL) dependent definition of drop frames: TL = 4, TL = 3, TL = 2, TL = 1, TL = 0
589-
const static int sMCTFSpeed[5] { 0, 0, ((3<<12) + (2<<9) + (2<<6) + (0<<3) + 0), ((3<<12) + (2<<9) + (2<<6) + (0<<3) + 0), ((3<<12) + (3<<9) + (2<<6) + (2<<3) + 2) };
589+
const static int sMCTFSpeed[5] { 0, 0, ((3<<12) + (2<<9) + (2<<6) + (0<<3) + 0), ((3<<12) + (2<<9) + (2<<6) + (0<<3) + 0), ((3<<12) + (3<<9) + (3<<6) + (2<<3) + 2) };
590590

591591
m_MCTFSpeedVal = sMCTFSpeed[ m_encCfg->m_vvencMCTF.MCTFSpeed ];
592592
m_lowResFltSearch = m_encCfg->m_vvencMCTF.MCTFSpeed > 0;
@@ -1247,17 +1247,20 @@ bool MCTF::estimateLumaLn( std::atomic_int& blockX_, std::atomic_int* prevLineX,
12471247
}
12481248
}
12491249

1250-
const int w = std::min<int>( blockSize, orig.Y().width - blockX ) & ~7;
1251-
const int h = std::min<int>( blockSize, orig.Y().height - blockY ) & ~7;
1250+
if( doubleRes )
1251+
{
1252+
const int w = std::min<int>( blockSize, orig.Y().width - blockX ) & ~7;
1253+
const int h = std::min<int>( blockSize, orig.Y().height - blockY ) & ~7;
12521254

1253-
CHECKD(bitDepth>10, "unsupported internal bit depth (also in calcVar)" );
1254-
const double bdScale = double(1<<(2*(10-bitDepth)));
1255-
const double dvar = m_calcVar( orig.Y().bufAt( blockX, blockY ), orig.Y().stride, w, h ) * bdScale;
1256-
const double mse = best.error * bdScale / double( w * h );
1255+
CHECKD(bitDepth>10, "unsupported internal bit depth (also in calcVar)" );
1256+
const double bdScale = double(1<<(2*(10-bitDepth)));
1257+
const double dvar = m_calcVar( orig.Y().bufAt( blockX, blockY ), orig.Y().stride, w, h ) * bdScale;
1258+
const double mse = best.error * bdScale / double( w * h );
12571259

1258-
best.error = ( int ) ( 20 * ( ( best.error*bdScale + 5.0 ) / ( dvar + 5.0 ) ) + mse / 50.0 );
1259-
best.rmsme = uint16_t( 0.5 + sqrt( mse ) );
1260-
best.overlap = ( ( double ) w * h ) / ( m_mctfUnitSize * m_mctfUnitSize );
1260+
best.error = ( int ) ( 20 * ( ( best.error*bdScale + 5.0 ) / ( dvar + 5.0 ) ) + mse / 50.0 );
1261+
best.rmsme = uint16_t( 0.5 + sqrt( mse ) );
1262+
best.overlap = ( ( double ) w * h ) / ( m_mctfUnitSize * m_mctfUnitSize );
1263+
}
12611264

12621265
mvs.get(blockX / stepSize, blockY / stepSize) = best;
12631266
}

source/Lib/CommonLib/x86/AdaptiveLoopFilterX86.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,9 +1151,9 @@ void simdFilterBlkCcAlf( const PelBuf &dstBuf, const CPelUnitBuf &recSrc, const
11511151
if( getChannelTypeScaleX( CH_C, nChromaFormat ) == 1 )
11521152
{
11531153
__m128i xfilterCoeff[4];
1154-
xfilterCoeff[0] = _mm_set1_epi32( ( filterCoeff[1] & 0xffff ) | ( filterCoeff[2] << 16 ) );
1155-
xfilterCoeff[1] = _mm_set1_epi32( ( filterCoeff[0] & 0xffff ) | ( filterCoeff[3] << 16 ) );
1156-
xfilterCoeff[2] = _mm_set1_epi32( ( filterCoeff[4] & 0xffff ) | ( filterCoeff[5] << 16 ) );
1154+
xfilterCoeff[0] = _mm_set1_epi32( ( filterCoeff[1] & 0xffff ) | ( filterCoeff[2] * (1<< 16 )));
1155+
xfilterCoeff[1] = _mm_set1_epi32( ( filterCoeff[0] & 0xffff ) | ( filterCoeff[3] * (1<< 16 )));
1156+
xfilterCoeff[2] = _mm_set1_epi32( ( filterCoeff[4] & 0xffff ) | ( filterCoeff[5] * (1<< 16 )));
11571157
xfilterCoeff[3] = _mm_set1_epi32( ( filterCoeff[6] & 0xffff ) );
11581158

11591159
for( int i = 0; i < endHeight - startHeight; i += clsSizeY )

source/Lib/CommonLib/x86/BufferX86.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2374,8 +2374,8 @@ uint64_t AvgHighPassWithDownsamplingDiff1st_SIMD (const int width, const int hei
23742374
pSrc -= iSrcStride;
23752375
pSrcM1-=iSrcM1Stride;
23762376
pSrcM1-=iSrcM1Stride;
2377-
uint32_t x;
2378-
uint32_t y;
2377+
int32_t x;
2378+
int32_t y;
23792379
const __m128i scale1 = _mm_set_epi16 (1,1,1,1,1,1,1,1);
23802380
for (y = 2; y < height-2; y += 2)
23812381
{
@@ -2456,8 +2456,8 @@ uint64_t AvgHighPassWithDownsamplingDiff2nd_SIMD (const int width,const int heig
24562456
{
24572457
uint64_t taAct = 0;
24582458
uint16_t act = 0;
2459-
uint32_t y;
2460-
uint32_t x;
2459+
int32_t y;
2460+
int32_t x;
24612461
pSrc -= iSrcStride;
24622462
pSrc -= iSrcStride;
24632463
pSrcM1-=iSM1Stride;

source/Lib/CommonLib/x86/InitX86.cpp

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ void InterpolationFilter::initInterpolationFilterX86( /*int iBitDepthY, int iBit
7979
_initInterpolationFilterX86<AVX2>(/*iBitDepthY, iBitDepthC*/);
8080
break;
8181
case AVX:
82-
_initInterpolationFilterX86<AVX>(/*iBitDepthY, iBitDepthC*/);
83-
break;
82+
//_initInterpolationFilterX86<AVX>(/*iBitDepthY, iBitDepthC*/);
83+
//break;
8484
case SSE42:
8585
case SSE41:
8686
_initInterpolationFilterX86<SSE41>(/*iBitDepthY, iBitDepthC*/);
@@ -106,8 +106,8 @@ void PelBufferOps::initPelBufOpsX86()
106106
_initPelBufOpsX86<AVX2>();
107107
break;
108108
case AVX:
109-
_initPelBufOpsX86<AVX>();
110-
break;
109+
//_initPelBufOpsX86<AVX>();
110+
//break;
111111
case SSE42:
112112
case SSE41:
113113
_initPelBufOpsX86<SSE41>();
@@ -130,8 +130,8 @@ void LoopFilter::initLoopFilterX86()
130130
_initLoopFilterX86<AVX2>();
131131
break;
132132
case AVX:
133-
_initLoopFilterX86<AVX>();
134-
break;
133+
//_initLoopFilterX86<AVX>();
134+
//break;
135135
case SSE42:
136136
case SSE41:
137137
_initLoopFilterX86<SSE41>();
@@ -156,8 +156,8 @@ void RdCost::initRdCostX86()
156156
break;
157157
#endif
158158
case AVX:
159-
_initRdCostX86<AVX>();
160-
break;
159+
//_initRdCostX86<AVX>();
160+
//break;
161161
case SSE42:
162162
case SSE41:
163163
_initRdCostX86<SSE41>();
@@ -179,8 +179,8 @@ void AdaptiveLoopFilter::initAdaptiveLoopFilterX86()
179179
_initAdaptiveLoopFilterX86<AVX2>();
180180
break;
181181
case AVX:
182-
_initAdaptiveLoopFilterX86<AVX>();
183-
break;
182+
//_initAdaptiveLoopFilterX86<AVX>();
183+
//break;
184184
case SSE42:
185185
case SSE41:
186186
_initAdaptiveLoopFilterX86<SSE41>();
@@ -201,8 +201,8 @@ void SampleAdaptiveOffset::initSampleAdaptiveOffsetX86()
201201
_initSampleAdaptiveOffsetX86<AVX2>();
202202
break;
203203
case AVX:
204-
_initSampleAdaptiveOffsetX86<AVX>();
205-
break;
204+
//_initSampleAdaptiveOffsetX86<AVX>();
205+
//break;
206206
case SSE42:
207207
case SSE41:
208208
_initSampleAdaptiveOffsetX86<SSE41>();
@@ -224,8 +224,8 @@ void InterPredInterpolation::initInterPredictionX86()
224224
_initInterPredictionX86<AVX2>();
225225
break;
226226
case AVX:
227-
_initInterPredictionX86<AVX>();
228-
break;
227+
//_initInterPredictionX86<AVX>();
228+
//break;
229229
case SSE42:
230230
case SSE41:
231231
_initInterPredictionX86<SSE41>();
@@ -246,8 +246,8 @@ void AffineGradientSearch::initAffineGradientSearchX86()
246246
_initAffineGradientSearchX86<AVX2>();
247247
break;
248248
case AVX:
249-
_initAffineGradientSearchX86<AVX>();
250-
break;
249+
//_initAffineGradientSearchX86<AVX>();
250+
//break;
251251
case SSE42:
252252
case SSE41:
253253
_initAffineGradientSearchX86<SSE41>();
@@ -268,8 +268,8 @@ void IntraPrediction::initIntraPredictionX86()
268268
_initIntraPredictionX86<AVX2>();
269269
break;
270270
case AVX:
271-
_initIntraPredictionX86<AVX>();
272-
break;
271+
//_initIntraPredictionX86<AVX>();
272+
//break;
273273
case SSE42:
274274
case SSE41:
275275
_initIntraPredictionX86<SSE41>();
@@ -290,11 +290,11 @@ void MCTF::initMCTF_X86()
290290
_initMCTF_X86<AVX2 >();
291291
break;
292292
case AVX:
293-
_initMCTF_X86<AVX >();
294-
break;
293+
//_initMCTF_X86<AVX >();
294+
//break;
295295
case SSE42:
296-
_initMCTF_X86<SSE42>();
297-
break;
296+
//_initMCTF_X86<SSE42>();
297+
//break;
298298
case SSE41:
299299
_initMCTF_X86<SSE41>();
300300
break;
@@ -314,11 +314,11 @@ void TCoeffOps::initTCoeffOpsX86()
314314
_initTCoeffOpsX86<AVX2 >();
315315
break;
316316
case AVX:
317-
_initTCoeffOpsX86<AVX >();
318-
break;
317+
//_initTCoeffOpsX86<AVX >();
318+
//break;
319319
case SSE42:
320-
_initTCoeffOpsX86<SSE42>();
321-
break;
320+
//_initTCoeffOpsX86<SSE42>();
321+
//break;
322322
case SSE41:
323323
_initTCoeffOpsX86<SSE41>();
324324
break;
@@ -334,17 +334,17 @@ void TrQuant::initTrQuantX86()
334334
{
335335
case AVX512:
336336
case AVX2:
337-
_initTrQuantX86<AVX2 >();
338-
break;
337+
_initTrQuantX86<AVX2 >();
338+
break;
339339
case AVX:
340-
_initTrQuantX86<AVX >();
341-
break;
340+
//_initTrQuantX86<AVX >();
341+
//break;
342342
case SSE42:
343-
_initTrQuantX86<SSE42>();
344-
break;
343+
//_initTrQuantX86<SSE42>();
344+
//break;
345345
case SSE41:
346-
_initTrQuantX86<SSE41>();
347-
break;
346+
_initTrQuantX86<SSE41>();
347+
break;
348348
default:
349349
break;
350350
}
@@ -363,8 +363,8 @@ void Quant::initQuantX86()
363363
_initQuantX86<AVX2>();
364364
break;
365365
case AVX:
366-
_initQuantX86<AVX>();
367-
break;
366+
//_initQuantX86<AVX>();
367+
//break;
368368
case SSE42:
369369
case SSE41:
370370
_initQuantX86<SSE41>();

source/Lib/EncoderLib/BitAllocation.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ static void clipQPValToEstimatedMinimStats (const uint8_t* minNoiseLevels, const
299299
return;
300300
}
301301

302-
i = std::max (0, apprI3Log2 (std::min (16.0, resFac) * i * i, false) + dQPOffset + extraQPOffset); // =6*log2
302+
i = std::max (0, apprI3Log2 (std::min (1.0, resFac) * i * i, false) + dQPOffset + extraQPOffset); // = 6*log2
303303
if (QP < i)
304304
{
305305
QP = i;
@@ -806,7 +806,8 @@ int BitAllocation::applyQPAdaptationSubCtu (const Slice* slice, const VVEncCfg*
806806
}
807807

808808
int BitAllocation::getCtuPumpingReducingQP (const Slice* slice, const CPelBuf& origY, const Distortion uiSadBestForQPA,
809-
std::vector<int>& ctuPumpRedQP, const uint32_t ctuRsAddr, const int baseQP)
809+
std::vector<int>& ctuPumpRedQP, const uint32_t ctuRsAddr, const int baseQP,
810+
const bool isBIM)
810811
{
811812
if (slice == nullptr || !slice->pps->useDQP || ctuPumpRedQP.size() <= ctuRsAddr) return 0;
812813

@@ -824,7 +825,7 @@ int BitAllocation::getCtuPumpingReducingQP (const Slice* slice, const CPelBuf& o
824825
}
825826

826827
const double sumAbsRatio = double (uiSadBestForQPA * 3 /*TODO: or 4? fine-tune!*/) / double (sumAbsZmOrig == 0 ? 1 : sumAbsZmOrig);
827-
const int pumpingReducQP = (int (log (Clip3 (0.25, 4.0, sumAbsRatio)) / log (2.0) + (sumAbsRatio < 1.0 ? -0.5 : 0.5))) >> (baseQP >= 38/*MAX_QP_PERCEPT_QPA*/ ? 1 : 0);
828+
const int pumpingReducQP = ((isBIM ? -1 : 0) + int (log (Clip3 (0.25, 4.0, sumAbsRatio)) / log (2.0) + (sumAbsRatio < 1.0 ? -0.5 : 0.5))) >> (baseQP >= 38/*MAX_QP_PERCEPT_QPA*/ ? 1 : 0);
828829

829830
ctuPumpRedQP[ctuRsAddr] += pumpingReducQP;
830831

source/Lib/EncoderLib/BitAllocation.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ namespace vvenc {
9090
const Distortion uiSadBestForQPA,
9191
std::vector<int>& ctuPumpRedQP,
9292
const uint32_t ctuRsAddr,
93-
const int baseQP );
93+
const int baseQP,
94+
const bool isBIM );
9495
}
9596

9697
} // namespace vvenc

0 commit comments

Comments
 (0)