Skip to content

Commit 5efa33c

Browse files
committed
ComputeStatistics(): multithreaded (I/O) optimization for Byte/UInt16 cases
for drivers (such as GTiff) that can do multithreaded decoded of a multi-block area of interest On a 50k x 50k Byte GTiff LZW-compressed tiled dataset, reduces the computation time from 4.0 s to 1.7 s with 6 threads. Fixes OSGeo#13671
1 parent f2bfd56 commit 5efa33c

File tree

2 files changed

+158
-27
lines changed

2 files changed

+158
-27
lines changed

autotest/gcore/gdal_stats.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,39 @@ def test_stats_float32_check_bugfix_13543(tmp_vsimem, GDAL_STATS_USE_FLOAT32_OPT
11421142
###############################################################################
11431143

11441144

1145+
@pytest.mark.parametrize("dt", [gdal.GDT_Byte, gdal.GDT_UInt16])
1146+
@pytest.mark.parametrize("GDAL_NUM_THREADS", [None, "ALL_CPUS"])
1147+
def test_stats_byte_uint16_optim_threads(tmp_vsimem, dt, GDAL_NUM_THREADS):
1148+
1149+
gdal.Translate(
1150+
tmp_vsimem / "test.tif",
1151+
"data/byte.tif",
1152+
outputType=dt,
1153+
width=1000,
1154+
height=500,
1155+
creationOptions=["TILED=YES", "COMPRESS=LZW"],
1156+
)
1157+
1158+
with gdal.config_option("GDAL_NUM_THREADS", GDAL_NUM_THREADS):
1159+
ds = gdal.Open(tmp_vsimem / "test.tif")
1160+
got_stats = ds.GetRasterBand(1).ComputeStatistics(False)
1161+
expected_stats = [74.0, 255.0, 126.765, 22.928470838675658]
1162+
assert got_stats == pytest.approx(expected_stats)
1163+
1164+
# Test I/O error
1165+
f = gdal.VSIFOpenL(tmp_vsimem / "test.tif", "rb+")
1166+
gdal.VSIFTruncateL(f, 1000)
1167+
gdal.VSIFCloseL(f)
1168+
1169+
with gdal.config_option("GDAL_NUM_THREADS", GDAL_NUM_THREADS):
1170+
ds = gdal.Open(tmp_vsimem / "test.tif")
1171+
with pytest.raises(Exception):
1172+
ds.GetRasterBand(1).ComputeStatistics(False)
1173+
1174+
1175+
###############################################################################
1176+
1177+
11451178
@pytest.mark.parametrize(
11461179
"dt,struct_fmt",
11471180
[

gcore/gdalrasterband.cpp

Lines changed: 125 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5764,6 +5764,8 @@ template <class T, bool COMPUTE_OTHER_STATS> struct ComputeStatisticsInternal
57645764
}
57655765
};
57665766

5767+
constexpr int ALIGNMENT_AVX2_OPTIM = 32;
5768+
57675769
#if (defined(__x86_64__) || defined(_M_X64) || \
57685770
defined(USE_NEON_OPTIMIZATIONS)) && \
57695771
(defined(__GNUC__) || defined(_MSC_VER))
@@ -5781,23 +5783,28 @@ ComputeStatisticsByteNoNodata(GPtrDiff_t nBlockPixels,
57815783
GUIntBig &nSampleCount, GUIntBig &nValidCount)
57825784
{
57835785
// 32-byte alignment may not be enforced by linker, so do it at hand
5784-
GByte
5785-
aby32ByteUnaligned[32 + 32 + 32 + (COMPUTE_OTHER_STATS ? 32 + 32 : 0)];
5786+
GByte aby32ByteUnaligned[ALIGNMENT_AVX2_OPTIM + ALIGNMENT_AVX2_OPTIM +
5787+
ALIGNMENT_AVX2_OPTIM +
5788+
(COMPUTE_OTHER_STATS
5789+
? ALIGNMENT_AVX2_OPTIM + ALIGNMENT_AVX2_OPTIM
5790+
: 0)];
57865791
GByte *paby32ByteAligned =
57875792
aby32ByteUnaligned +
5788-
(32 - (reinterpret_cast<GUIntptr_t>(aby32ByteUnaligned) % 32));
5793+
(ALIGNMENT_AVX2_OPTIM -
5794+
(reinterpret_cast<GUIntptr_t>(aby32ByteUnaligned) %
5795+
ALIGNMENT_AVX2_OPTIM));
57895796
GByte *pabyMin = paby32ByteAligned;
5790-
GByte *pabyMax = paby32ByteAligned + 32;
5791-
GUInt32 *panSum =
5792-
COMPUTE_OTHER_STATS
5793-
? reinterpret_cast<GUInt32 *>(paby32ByteAligned + 32 * 2)
5794-
: nullptr;
5797+
GByte *pabyMax = paby32ByteAligned + ALIGNMENT_AVX2_OPTIM;
5798+
GUInt32 *panSum = COMPUTE_OTHER_STATS
5799+
? reinterpret_cast<GUInt32 *>(
5800+
paby32ByteAligned + ALIGNMENT_AVX2_OPTIM * 2)
5801+
: nullptr;
57955802
GUInt32 *panSumSquare =
5796-
COMPUTE_OTHER_STATS
5797-
? reinterpret_cast<GUInt32 *>(paby32ByteAligned + 32 * 3)
5798-
: nullptr;
5803+
COMPUTE_OTHER_STATS ? reinterpret_cast<GUInt32 *>(
5804+
paby32ByteAligned + ALIGNMENT_AVX2_OPTIM * 3)
5805+
: nullptr;
57995806

5800-
CPLAssert((reinterpret_cast<uintptr_t>(pData) % 32) == 0);
5807+
CPLAssert((reinterpret_cast<uintptr_t>(pData) % ALIGNMENT_AVX2_OPTIM) == 0);
58015808

58025809
GPtrDiff_t i = 0;
58035810
// Make sure that sumSquare can fit on uint32
@@ -7368,25 +7375,115 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
73687375
? static_cast<GUInt32>(sNoDataValues.dfNoDataValue + 1e-10)
73697376
: nMaxValueType + 1;
73707377

7378+
int nChunkXSize = nBlockXSize;
7379+
int nChunkYSize = nBlockYSize;
7380+
int nChunksPerRow = nBlocksPerRow;
7381+
int nChunksPerCol = nBlocksPerColumn;
7382+
7383+
int nThreads = 1;
7384+
if (nChunkYSize > 1)
7385+
{
7386+
const char *pszNumThreads =
7387+
CPLGetConfigOption("GDAL_NUM_THREADS", nullptr);
7388+
if (pszNumThreads)
7389+
{
7390+
if (EQUAL(pszNumThreads, "ALL_CPUS"))
7391+
nThreads = CPLGetNumCPUs();
7392+
else
7393+
nThreads =
7394+
std::clamp(atoi(pszNumThreads), 1, CPLGetNumCPUs());
7395+
}
7396+
}
7397+
7398+
int nNewChunkXSize = nChunkXSize;
7399+
const int nDTSize = GDALGetDataTypeSizeBytes(eDataType);
7400+
if (!bApproxOK && nThreads > 1 &&
7401+
MayMultiBlockReadingBeMultiThreaded())
7402+
{
7403+
const int64_t nRAMAmount = CPLGetUsablePhysicalRAM() / 10;
7404+
const size_t nChunkPixels =
7405+
static_cast<size_t>(nChunkXSize) * nChunkYSize;
7406+
if (nRAMAmount > 0 &&
7407+
nChunkPixels <=
7408+
std::numeric_limits<size_t>::max() / nDTSize)
7409+
{
7410+
const size_t nBlockSize = nDTSize * nChunkPixels;
7411+
const int64_t nBlockCount = nRAMAmount / nBlockSize;
7412+
if (nBlockCount >= 2)
7413+
{
7414+
nNewChunkXSize = static_cast<int>(std::min<int64_t>(
7415+
nChunkXSize * std::min<int64_t>(
7416+
nBlockCount,
7417+
(std::numeric_limits<int>::max() -
7418+
ALIGNMENT_AVX2_OPTIM) /
7419+
nChunkPixels),
7420+
nRasterXSize));
7421+
7422+
CPLAssert(nChunkXSize <
7423+
std::numeric_limits<int>::max() /
7424+
nChunkYSize);
7425+
}
7426+
}
7427+
}
7428+
7429+
std::unique_ptr<GByte, VSIFreeReleaser> pabyTempUnaligned;
7430+
GByte *pabyTemp = nullptr;
7431+
if (nNewChunkXSize != nBlockXSize)
7432+
{
7433+
pabyTempUnaligned.reset(static_cast<GByte *>(
7434+
VSIMalloc(nDTSize * nNewChunkXSize * nChunkYSize +
7435+
ALIGNMENT_AVX2_OPTIM)));
7436+
if (pabyTempUnaligned)
7437+
{
7438+
pabyTemp = reinterpret_cast<GByte *>(
7439+
reinterpret_cast<uintptr_t>(pabyTempUnaligned.get()) +
7440+
(ALIGNMENT_AVX2_OPTIM -
7441+
(reinterpret_cast<uintptr_t>(pabyTempUnaligned.get()) %
7442+
ALIGNMENT_AVX2_OPTIM)));
7443+
nChunkXSize = nNewChunkXSize;
7444+
nChunksPerRow =
7445+
cpl::div_round_up(nRasterXSize, nChunkXSize);
7446+
}
7447+
}
7448+
73717449
for (GIntBig iSampleBlock = 0;
73727450
iSampleBlock <
7373-
static_cast<GIntBig>(nBlocksPerRow) * nBlocksPerColumn;
7451+
static_cast<GIntBig>(nChunksPerRow) * nChunksPerCol;
73747452
iSampleBlock += nSampleRate)
73757453
{
73767454
const int iYBlock =
7377-
static_cast<int>(iSampleBlock / nBlocksPerRow);
7455+
static_cast<int>(iSampleBlock / nChunksPerRow);
73787456
const int iXBlock =
7379-
static_cast<int>(iSampleBlock % nBlocksPerRow);
7457+
static_cast<int>(iSampleBlock % nChunksPerRow);
73807458

7381-
GDALRasterBlock *const poBlock =
7382-
GetLockedBlockRef(iXBlock, iYBlock);
7383-
if (poBlock == nullptr)
7384-
return CE_Failure;
7459+
const int nXCheck =
7460+
std::min(nRasterXSize - nChunkXSize * iXBlock, nChunkXSize);
7461+
const int nYCheck =
7462+
std::min(nRasterYSize - nChunkYSize * iYBlock, nChunkYSize);
73857463

7386-
void *const pData = poBlock->GetDataRef();
7464+
GDALRasterBlock *poBlock = nullptr;
7465+
if (pabyTemp)
7466+
{
7467+
if (RasterIO(GF_Read, iXBlock * nChunkXSize,
7468+
iYBlock * nChunkYSize, nXCheck, nYCheck,
7469+
pabyTemp, nXCheck, nYCheck, eDataType, 0,
7470+
static_cast<GSpacing>(nChunkXSize * nDTSize),
7471+
nullptr) != CE_None)
7472+
{
7473+
return CE_Failure;
7474+
}
7475+
}
7476+
else
7477+
{
7478+
poBlock = GetLockedBlockRef(iXBlock, iYBlock);
7479+
if (poBlock == nullptr)
7480+
{
7481+
return CE_Failure;
7482+
}
7483+
}
73877484

7388-
int nXCheck = 0, nYCheck = 0;
7389-
GetActualBlockSize(iXBlock, iYBlock, &nXCheck, &nYCheck);
7485+
const void *const pData =
7486+
poBlock ? poBlock->GetDataRef() : pabyTemp;
73907487

73917488
GUIntBig nBlockSum = 0;
73927489
GUIntBig nBlockSumSquare = 0;
@@ -7404,7 +7501,7 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
74047501
{
74057502
ComputeStatisticsInternal<
74067503
GByte, /* COMPUTE_OTHER_STATS = */ true>::
7407-
f(nXCheck, nBlockXSize, nYCheck,
7504+
f(nXCheck, nChunkXSize, nYCheck,
74087505
static_cast<const GByte *>(pData),
74097506
nNoDataValue <= nMaxValueType, nNoDataValue, nMin,
74107507
nMax, nBlockSumRef, nBlockSumSquareRef,
@@ -7414,14 +7511,15 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
74147511
{
74157512
ComputeStatisticsInternal<
74167513
GUInt16, /* COMPUTE_OTHER_STATS = */ true>::
7417-
f(nXCheck, nBlockXSize, nYCheck,
7514+
f(nXCheck, nChunkXSize, nYCheck,
74187515
static_cast<const GUInt16 *>(pData),
74197516
nNoDataValue <= nMaxValueType, nNoDataValue, nMin,
74207517
nMax, nBlockSumRef, nBlockSumSquareRef,
74217518
nBlockSampleCountRef, nBlockValidCountRef);
74227519
}
74237520

7424-
poBlock->DropLock();
7521+
if (poBlock)
7522+
poBlock->DropLock();
74257523

74267524
if (!bIntegerStats)
74277525
{
@@ -7457,8 +7555,8 @@ CPLErr GDALRasterBand::ComputeStatistics(int bApproxOK, double *pdfMin,
74577555
}
74587556

74597557
if (!pfnProgress(static_cast<double>(iSampleBlock) /
7460-
(static_cast<double>(nBlocksPerRow) *
7461-
nBlocksPerColumn),
7558+
(static_cast<double>(nChunksPerRow) *
7559+
nChunksPerCol),
74627560
"Compute Statistics", pProgressData))
74637561
{
74647562
ReportError(CE_Failure, CPLE_UserInterrupt,

0 commit comments

Comments
 (0)