@@ -5764,6 +5764,8 @@ template <class T, bool COMPUTE_OTHER_STATS> struct ComputeStatisticsInternal
57645764 }
57655765};
57665766
5767+ constexpr int ALIGNMENT_AVX2_OPTIM = 32 ;
5768+
57675769#if (defined(__x86_64__) || defined(_M_X64) || \
57685770 defined (USE_NEON_OPTIMIZATIONS)) && \
57695771 (defined (__GNUC__) || defined(_MSC_VER))
@@ -5781,23 +5783,28 @@ ComputeStatisticsByteNoNodata(GPtrDiff_t nBlockPixels,
57815783 GUIntBig &nSampleCount, GUIntBig &nValidCount)
57825784{
57835785 // 32-byte alignment may not be enforced by linker, so do it at hand
5784- GByte
5785- aby32ByteUnaligned[32 + 32 + 32 + (COMPUTE_OTHER_STATS ? 32 + 32 : 0 )];
5786+ GByte aby32ByteUnaligned[ALIGNMENT_AVX2_OPTIM + ALIGNMENT_AVX2_OPTIM +
5787+ ALIGNMENT_AVX2_OPTIM +
5788+ (COMPUTE_OTHER_STATS
5789+ ? ALIGNMENT_AVX2_OPTIM + ALIGNMENT_AVX2_OPTIM
5790+ : 0 )];
57865791 GByte *paby32ByteAligned =
57875792 aby32ByteUnaligned +
5788- (32 - (reinterpret_cast <GUIntptr_t>(aby32ByteUnaligned) % 32 ));
5793+ (ALIGNMENT_AVX2_OPTIM -
5794+ (reinterpret_cast <GUIntptr_t>(aby32ByteUnaligned) %
5795+ ALIGNMENT_AVX2_OPTIM));
57895796 GByte *pabyMin = paby32ByteAligned;
5790- GByte *pabyMax = paby32ByteAligned + 32 ;
5791- GUInt32 *panSum =
5792- COMPUTE_OTHER_STATS
5793- ? reinterpret_cast <GUInt32 *>( paby32ByteAligned + 32 * 2 )
5794- : nullptr ;
5797+ GByte *pabyMax = paby32ByteAligned + ALIGNMENT_AVX2_OPTIM ;
5798+ GUInt32 *panSum = COMPUTE_OTHER_STATS
5799+ ? reinterpret_cast <GUInt32 *>(
5800+ paby32ByteAligned + ALIGNMENT_AVX2_OPTIM * 2 )
5801+ : nullptr ;
57955802 GUInt32 *panSumSquare =
5796- COMPUTE_OTHER_STATS
5797- ? reinterpret_cast <GUInt32 *>( paby32ByteAligned + 32 * 3 )
5798- : nullptr ;
5803+ COMPUTE_OTHER_STATS ? reinterpret_cast <GUInt32 *>(
5804+ paby32ByteAligned + ALIGNMENT_AVX2_OPTIM * 3 )
5805+ : nullptr ;
57995806
5800- CPLAssert ((reinterpret_cast <uintptr_t >(pData) % 32 ) == 0 );
5807+ CPLAssert ((reinterpret_cast <uintptr_t >(pData) % ALIGNMENT_AVX2_OPTIM ) == 0 );
58015808
58025809 GPtrDiff_t i = 0 ;
58035810 // Make sure that sumSquare can fit on uint32
@@ -7368,25 +7375,115 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
73687375 ? static_cast <GUInt32>(sNoDataValues .dfNoDataValue + 1e-10 )
73697376 : nMaxValueType + 1 ;
73707377
7378+ int nChunkXSize = nBlockXSize;
7379+ int nChunkYSize = nBlockYSize;
7380+ int nChunksPerRow = nBlocksPerRow;
7381+ int nChunksPerCol = nBlocksPerColumn;
7382+
7383+ int nThreads = 1 ;
7384+ if (nChunkYSize > 1 )
7385+ {
7386+ const char *pszNumThreads =
7387+ CPLGetConfigOption (" GDAL_NUM_THREADS" , nullptr );
7388+ if (pszNumThreads)
7389+ {
7390+ if (EQUAL (pszNumThreads, " ALL_CPUS" ))
7391+ nThreads = CPLGetNumCPUs ();
7392+ else
7393+ nThreads =
7394+ std::clamp (atoi (pszNumThreads), 1 , CPLGetNumCPUs ());
7395+ }
7396+ }
7397+
7398+ int nNewChunkXSize = nChunkXSize;
7399+ const int nDTSize = GDALGetDataTypeSizeBytes (eDataType);
7400+ if (!bApproxOK && nThreads > 1 &&
7401+ MayMultiBlockReadingBeMultiThreaded ())
7402+ {
7403+ const int64_t nRAMAmount = CPLGetUsablePhysicalRAM () / 10 ;
7404+ const size_t nChunkPixels =
7405+ static_cast <size_t >(nChunkXSize) * nChunkYSize;
7406+ if (nRAMAmount > 0 &&
7407+ nChunkPixels <=
7408+ std::numeric_limits<size_t >::max () / nDTSize)
7409+ {
7410+ const size_t nBlockSize = nDTSize * nChunkPixels;
7411+ const int64_t nBlockCount = nRAMAmount / nBlockSize;
7412+ if (nBlockCount >= 2 )
7413+ {
7414+ nNewChunkXSize = static_cast <int >(std::min<int64_t >(
7415+ nChunkXSize * std::min<int64_t >(
7416+ nBlockCount,
7417+ (std::numeric_limits<int >::max () -
7418+ ALIGNMENT_AVX2_OPTIM) /
7419+ nChunkPixels),
7420+ nRasterXSize));
7421+
7422+ CPLAssert (nChunkXSize <
7423+ std::numeric_limits<int >::max () /
7424+ nChunkYSize);
7425+ }
7426+ }
7427+ }
7428+
7429+ std::unique_ptr<GByte, VSIFreeReleaser> pabyTempUnaligned;
7430+ GByte *pabyTemp = nullptr ;
7431+ if (nNewChunkXSize != nBlockXSize)
7432+ {
7433+ pabyTempUnaligned.reset (static_cast <GByte *>(
7434+ VSIMalloc (nDTSize * nNewChunkXSize * nChunkYSize +
7435+ ALIGNMENT_AVX2_OPTIM)));
7436+ if (pabyTempUnaligned)
7437+ {
7438+ pabyTemp = reinterpret_cast <GByte *>(
7439+ reinterpret_cast <uintptr_t >(pabyTempUnaligned.get ()) +
7440+ (ALIGNMENT_AVX2_OPTIM -
7441+ (reinterpret_cast <uintptr_t >(pabyTempUnaligned.get ()) %
7442+ ALIGNMENT_AVX2_OPTIM)));
7443+ nChunkXSize = nNewChunkXSize;
7444+ nChunksPerRow =
7445+ cpl::div_round_up (nRasterXSize, nChunkXSize);
7446+ }
7447+ }
7448+
73717449 for (GIntBig iSampleBlock = 0 ;
73727450 iSampleBlock <
7373- static_cast <GIntBig>(nBlocksPerRow ) * nBlocksPerColumn ;
7451+ static_cast <GIntBig>(nChunksPerRow ) * nChunksPerCol ;
73747452 iSampleBlock += nSampleRate)
73757453 {
73767454 const int iYBlock =
7377- static_cast <int >(iSampleBlock / nBlocksPerRow );
7455+ static_cast <int >(iSampleBlock / nChunksPerRow );
73787456 const int iXBlock =
7379- static_cast <int >(iSampleBlock % nBlocksPerRow );
7457+ static_cast <int >(iSampleBlock % nChunksPerRow );
73807458
7381- GDALRasterBlock * const poBlock =
7382- GetLockedBlockRef ( iXBlock, iYBlock );
7383- if (poBlock == nullptr )
7384- return CE_Failure ;
7459+ const int nXCheck =
7460+ std::min (nRasterXSize - nChunkXSize * iXBlock, nChunkXSize );
7461+ const int nYCheck =
7462+ std::min (nRasterYSize - nChunkYSize * iYBlock, nChunkYSize) ;
73857463
7386- void *const pData = poBlock->GetDataRef ();
7464+ GDALRasterBlock *poBlock = nullptr ;
7465+ if (pabyTemp)
7466+ {
7467+ if (RasterIO (GF_Read, iXBlock * nChunkXSize,
7468+ iYBlock * nChunkYSize, nXCheck, nYCheck,
7469+ pabyTemp, nXCheck, nYCheck, eDataType, 0 ,
7470+ static_cast <GSpacing>(nChunkXSize * nDTSize),
7471+ nullptr ) != CE_None)
7472+ {
7473+ return CE_Failure;
7474+ }
7475+ }
7476+ else
7477+ {
7478+ poBlock = GetLockedBlockRef (iXBlock, iYBlock);
7479+ if (poBlock == nullptr )
7480+ {
7481+ return CE_Failure;
7482+ }
7483+ }
73877484
7388- int nXCheck = 0 , nYCheck = 0 ;
7389- GetActualBlockSize (iXBlock, iYBlock, &nXCheck, &nYCheck) ;
7485+ const void * const pData =
7486+ poBlock ? poBlock-> GetDataRef () : pabyTemp ;
73907487
73917488 GUIntBig nBlockSum = 0 ;
73927489 GUIntBig nBlockSumSquare = 0 ;
@@ -7404,7 +7501,7 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
74047501 {
74057502 ComputeStatisticsInternal<
74067503 GByte, /* COMPUTE_OTHER_STATS = */ true >::
7407- f (nXCheck, nBlockXSize , nYCheck,
7504+ f (nXCheck, nChunkXSize , nYCheck,
74087505 static_cast <const GByte *>(pData),
74097506 nNoDataValue <= nMaxValueType, nNoDataValue, nMin,
74107507 nMax, nBlockSumRef, nBlockSumSquareRef,
@@ -7414,14 +7511,15 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
74147511 {
74157512 ComputeStatisticsInternal<
74167513 GUInt16, /* COMPUTE_OTHER_STATS = */ true >::
7417- f (nXCheck, nBlockXSize , nYCheck,
7514+ f (nXCheck, nChunkXSize , nYCheck,
74187515 static_cast <const GUInt16 *>(pData),
74197516 nNoDataValue <= nMaxValueType, nNoDataValue, nMin,
74207517 nMax, nBlockSumRef, nBlockSumSquareRef,
74217518 nBlockSampleCountRef, nBlockValidCountRef);
74227519 }
74237520
7424- poBlock->DropLock ();
7521+ if (poBlock)
7522+ poBlock->DropLock ();
74257523
74267524 if (!bIntegerStats)
74277525 {
@@ -7457,8 +7555,8 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
74577555 }
74587556
74597557 if (!pfnProgress (static_cast <double >(iSampleBlock) /
7460- (static_cast <double >(nBlocksPerRow ) *
7461- nBlocksPerColumn ),
7558+ (static_cast <double >(nChunksPerRow ) *
7559+ nChunksPerCol ),
74627560 " Compute Statistics" , pProgressData))
74637561 {
74647562 ReportError (CE_Failure, CPLE_UserInterrupt,
0 commit comments