From aa69a6e9419151093cba7a65da0e9c8757b1996c Mon Sep 17 00:00:00 2001 From: Michael Sumner Date: Fri, 15 May 2026 12:05:33 +0000 Subject: [PATCH 1/2] Add gdal mdim get-refs algorithm --- apps/CMakeLists.txt | 1 + apps/gdalalg_mdim.cpp | 2 + apps/gdalalg_mdim_get_refs.cpp | 403 ++++++++++++++++++ apps/gdalalg_mdim_get_refs.h | 50 +++ apps/gdalalg_mdim_get_refs_common.h | 125 ++++++ autotest/utilities/data/mdim_zarr.vrt | 32 ++ .../utilities/test_gdalalg_mdim_get_refs.py | 321 ++++++++++++++ doc/source/programs/gdal_mdim_get_refs.rst | 190 +++++++++ 8 files changed, 1124 insertions(+) create mode 100644 apps/gdalalg_mdim_get_refs.cpp create mode 100644 apps/gdalalg_mdim_get_refs.h create mode 100644 apps/gdalalg_mdim_get_refs_common.h create mode 100644 autotest/utilities/data/mdim_zarr.vrt create mode 100644 autotest/utilities/test_gdalalg_mdim_get_refs.py create mode 100644 doc/source/programs/gdal_mdim_get_refs.rst diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index d9b156cd81a0..3b6af99cc653 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -43,6 +43,7 @@ target_sources(appslib PRIVATE gdalalg_mdim.cpp gdalalg_mdim_info.cpp gdalalg_mdim_convert.cpp + gdalalg_mdim_get_refs.cpp gdalalg_mdim_mosaic.cpp gdalalg_pipeline.cpp gdalalg_raster.cpp diff --git a/apps/gdalalg_mdim.cpp b/apps/gdalalg_mdim.cpp index ec52b09dd79b..e67c13dd155f 100644 --- a/apps/gdalalg_mdim.cpp +++ b/apps/gdalalg_mdim.cpp @@ -17,6 +17,7 @@ #include "gdalalg_mdim_info.h" #include "gdalalg_mdim_convert.h" #include "gdalalg_mdim_mosaic.h" +#include "gdalalg_mdim_get_refs.h" #include "gdal_priv.h" @@ -40,6 +41,7 @@ GDALMdimAlgorithm::GDALMdimAlgorithm() RegisterSubAlgorithm(); RegisterSubAlgorithm(); RegisterSubAlgorithm(); + RegisterSubAlgorithm(); } bool GDALMdimAlgorithm::RunImpl(GDALProgressFunc, void *) diff --git a/apps/gdalalg_mdim_get_refs.cpp b/apps/gdalalg_mdim_get_refs.cpp new file mode 100644 index 000000000000..b9d8520e3e17 --- /dev/null +++ b/apps/gdalalg_mdim_get_refs.cpp @@ -0,0 +1,403 @@ +/****************************************************************************** + * + * Project: GDAL + * Purpose: gdal "mdim get-refs" subcommand + * Author: Michael Sumner + * + ****************************************************************************** + * Copyright (c) 2026, Michael Sumner + * + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#include "gdalalg_mdim_get_refs.h" +#include "cpl_conv.h" +#include "gdal_priv.h" +#include "ogrsf_frmts.h" +#include "gdalalg_mdim_get_refs_common.h" + +//! @cond Doxygen_Suppress + +#ifndef _ +#define _(x) (x) +#endif + +namespace +{ + +// Format a vector of integers as "[a, b, c]" for debug messages. +std::string FormatVec(const std::vector &v) +{ + std::string s = "["; + for (size_t i = 0; i < v.size(); ++i) + { + if (i > 0) + s += ", "; + s += std::to_string(v[i]); + } + s += "]"; + return s; +} + +} // namespace + +/************************************************************************/ +/* GDALMdimGetRefsAlgorithm() */ +/************************************************************************/ + +GDALMdimGetRefsAlgorithm::GDALMdimGetRefsAlgorithm() + : GDALAlgorithm(NAME, DESCRIPTION, HELP_URL) +{ + AddProgressArg(); + AddOutputFormatArg(&m_outputFormat, /* bStreamAllowed = */ false, + /* bGDALGAllowed = */ false) + .AddMetadataItem(GAAMDI_REQUIRED_CAPABILITIES, + {GDAL_DCAP_VECTOR, GDAL_DCAP_CREATE}) + .SetRequired(); + AddOpenOptionsArg(&m_openOptions); + AddInputFormatsArg(&m_inputFormats) + .AddMetadataItem(GAAMDI_REQUIRED_CAPABILITIES, + {GDAL_DCAP_MULTIDIM_RASTER}); + AddInputDatasetArg(&m_inputDataset, GDAL_OF_MULTIDIM_RASTER) + .AddAlias("dataset"); + AddOutputDatasetArg(&m_outputDataset, GDAL_OF_VECTOR) + .SetDatasetInputFlags(GADV_NAME | GADV_OBJECT); + AddArrayNameArg(&m_array, _("Name of the array, used to restrict the " + "output to the specified array.")) + .SetRequired(); + AddOverwriteArg(&m_overwrite); + AddCreationOptionsArg(&m_creationOptions); +} + +bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, + void *pProgressData) +{ + // ---------------------------------------------------------------------- + // STAGE A — resolve the input array + // ---------------------------------------------------------------------- + // A1. Get the input GDALDataset from m_inputDataset (already opened by the + // framework as OF_MULTIDIM_RASTER — confirm footprint/convert rely on + // the framework open rather than re-opening). GetDatasetRef(). + + auto poSrcDS = m_inputDataset.GetDatasetRef(); + CPLAssert(poSrcDS); + + // A2. GetRootGroup(). Null root => driver lacks mdim support => fail with a + // clear CPLError, return false. + auto poRootGroup = poSrcDS->GetRootGroup(); + CPLDebug("MDIM-GET-REFS", "input: %s, root group: %s", + poSrcDS->GetDescription(), poRootGroup ? "present" : "NULL"); + + if (!poRootGroup) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Dataset %s has no root group (not multidimensional?)", + poSrcDS->GetDescription()); + return false; + } + + auto poArray = poRootGroup->OpenMDArrayFromFullname(m_array); + if (!poArray) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot find array %s in dataset %s. \n" + "Use 'gdal mdim info %s' to list available arrays.", + m_array.c_str(), poSrcDS->GetDescription(), + poSrcDS->GetDescription()); + return false; + } + + const std::vector> apoDims = + poArray->GetDimensions(); + + const auto anBlockSize = poArray->GetBlockSize(); + std::vector anBlockSizeU64(anBlockSize.begin(), + anBlockSize.end()); + CPLAssert(anBlockSize.size() == poArray->GetDimensionCount()); + for (size_t i = 0; i < anBlockSizeU64.size(); ++i) + { + if (anBlockSize[i] == 0) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Array %s has no natural block size on dimension %zu; " + "not chunk-enumerable", + m_array.c_str(), i); + return false; + } + } + + const auto &dt = poArray->GetDataType(); + GDALDataType nDataType = dt.GetNumericDataType(); + if (nDataType == GDT_Unknown) + { + ReportError( + CE_Failure, CPLE_AppDefined, + "Array %s has non-numeric or unknown data type; not supported", + m_array.c_str()); + return false; + } + const char *dt_name = GDALGetDataTypeName(nDataType); + + // Build the dim-size vector (needed by ComputeChunkGrid and for debug) + std::vector anDimSize(apoDims.size()); + for (size_t i = 0; i < apoDims.size(); ++i) + anDimSize[i] = apoDims[i]->GetSize(); + + // Stage C: chunk grid via the helper + std::vector n_chunks; + const size_t nTotalChunks = + get_refs::ComputeChunkGrid(anDimSize, anBlockSizeU64, n_chunks); + + CPLDebug( + "MDIM-GET-REFS", + "array %s: dims=[%s], blocks=[%s], chunks=[%s], total=%zu, dtype=%s", + m_array.c_str(), FormatVec(anDimSize).c_str(), + FormatVec(anBlockSizeU64).c_str(), FormatVec(n_chunks).c_str(), + nTotalChunks, dt_name); + + const std::string osOutputPath = m_outputDataset.GetName(); + + GDALDriver *poDriver = + GetGDALDriverManager()->GetDriverByName(m_outputFormat.c_str()); + if (!poDriver) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot find vector driver '%s' for output dataset. " + "Use 'gdal --formats' to list available drivers.", + m_outputFormat.c_str()); + return false; + } + + auto poDstDS = std::unique_ptr( + poDriver->Create(osOutputPath.c_str(), 0, 0, 0, GDT_Unknown, + CPLStringList(m_creationOptions).List())); + if (!poDstDS) + { + return false; + } + + std::string osLayerName = m_array; + const auto nLastSlash = osLayerName.rfind('/'); + if (nLastSlash != std::string::npos) + osLayerName = osLayerName.substr(nLastSlash + 1); + + OGRLayer *poLayer = + poDstDS->CreateLayer(osLayerName.c_str(), nullptr, wkbNone, nullptr); + if (!poLayer) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot create layer '%s' in output dataset '%s'", + osLayerName.c_str(), osOutputPath.c_str()); + return false; + } + + // D6. Per-dimension fields: dim_0, dim_1, ... as OFTInteger64 + // (names live in layer metadata, not field names — keeps the schema + // identical-shape across arrays, sidesteps sanitization). + for (size_t i = 0; i < apoDims.size(); ++i) + { + OGRFieldDefn oField(CPLSPrintf("dim_%zu", i), OFTInteger64); + if (poLayer->CreateField(&oField) != OGRERR_NONE) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot create field 'dim_%zu'", i); + return false; + } + } + + // D7. Generic fields per RFC Stage 1 schema. + OGRFieldDefn oPresentField("present", OFTInteger); + oPresentField.SetSubType(OFSTBoolean); + if (poLayer->CreateField(&oPresentField) != OGRERR_NONE) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot create field 'present'"); + return false; + } + + OGRFieldDefn oPathField("path", OFTString); + oPathField.SetNullable(TRUE); + if (poLayer->CreateField(&oPathField) != OGRERR_NONE) + { + ReportError(CE_Failure, CPLE_AppDefined, "Cannot create field 'path'"); + return false; + } + + OGRFieldDefn oOffsetField("offset", OFTInteger64); + oOffsetField.SetNullable(TRUE); + if (poLayer->CreateField(&oOffsetField) != OGRERR_NONE) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot create field 'offset'"); + return false; + } + + OGRFieldDefn oSizeField("size", OFTInteger64); + oSizeField.SetNullable(TRUE); + if (poLayer->CreateField(&oSizeField) != OGRERR_NONE) + { + ReportError(CE_Failure, CPLE_AppDefined, "Cannot create field 'size'"); + return false; + } + + OGRFieldDefn oInfoField("info", OFTString); + oInfoField.SetNullable(TRUE); + if (poLayer->CreateField(&oInfoField) != OGRERR_NONE) + { + ReportError(CE_Failure, CPLE_AppDefined, "Cannot create field 'info'"); + return false; + } + + // D8. Array-level metadata on the layer. + poLayer->SetMetadataItem("ARRAY_NAME", m_array.c_str()); + poLayer->SetMetadataItem("DTYPE", dt_name); + for (size_t i = 0; i < apoDims.size(); ++i) + { + poLayer->SetMetadataItem(CPLSPrintf("DIM_%zu_NAME", i), + apoDims[i]->GetName().c_str()); + poLayer->SetMetadataItem( + CPLSPrintf("DIM_%zu_SIZE", i), + CPLSPrintf(CPL_FRMT_GUIB, + static_cast(apoDims[i]->GetSize()))); + poLayer->SetMetadataItem( + CPLSPrintf("DIM_%zu_BLOCK", i), + CPLSPrintf(CPL_FRMT_GUIB, static_cast(anBlockSize[i]))); + poLayer->SetMetadataItem(CPLSPrintf("DIM_%zu_CHUNKS", i), + CPLSPrintf("%zu", n_chunks[i])); + } + + CPLDebug("MDIM-GET-REFS", + "created layer '%s' with %d fields, ready for %zu features", + osLayerName.c_str(), poLayer->GetLayerDefn()->GetFieldCount(), + nTotalChunks); + + std::vector coords( + apoDims.size()); // reused, inside LinearToCoords + GDALMDArrayRawBlockInfo info; // reused, .clear() per iteration + + // Loop over chunks + const size_t nProgressInterval = std::max(1, nTotalChunks / 100); + bool bCodecHoisted = false; + for (size_t iLinear = 0; iLinear < nTotalChunks; ++iLinear) + { + info.clear(); + get_refs::LinearToCoords(iLinear, n_chunks, coords); + if (!poArray->GetRawBlockInfo(coords.data(), info)) + { + ReportError(CE_Failure, CPLE_AppDefined, + "GetRawBlockInfo failed at linear index %zu", iLinear); + return false; + } + if (iLinear < 3 || iLinear == nTotalChunks - 1) + { + CPLDebug("MDIM-GET-REFS", + "chunk %zu coords=[%s] path=%s offset=" CPL_FRMT_GUIB + " size=" CPL_FRMT_GUIB, + iLinear, FormatVec(coords).c_str(), + info.pszFilename ? info.pszFilename : "(null)", + static_cast(info.nOffset), + static_cast(info.nSize)); + } + OGRFeature *poFeature = + OGRFeature::CreateFeature(poLayer->GetLayerDefn()); + + // Per-dim coordinates: dim_0 .. dim_{n-1} + for (size_t i = 0; i < coords.size(); ++i) + poFeature->SetField(static_cast(i), + static_cast(coords[i])); + + // Three-state classification — exactly the Commit 1 reconciliation + const int iPresentField = static_cast(coords.size()); + const int iPathField = iPresentField + 1; + const int iOffsetField = iPresentField + 2; + const int iSizeField = iPresentField + 3; + const int iInfoField = iPresentField + 4; + + if (info.pszFilename != nullptr) + { + // present (file-backed) + poFeature->SetField(iPresentField, 1); + poFeature->SetField(iPathField, info.pszFilename); + poFeature->SetField(iOffsetField, + static_cast(info.nOffset)); + poFeature->SetField(iSizeField, static_cast(info.nSize)); + } + else if (info.pabyInlineData != nullptr) + { + // inline — Stage 1 reports size but not bytes + // (note: classify by pabyInlineData, not nSize > 0 — Commit 1) + poFeature->SetField(iPresentField, 1); + // path and offset left null (default state) + poFeature->SetField(iSizeField, static_cast(info.nSize)); + } + else + { + // absent (sparse) + poFeature->SetField(iPresentField, 0); + // path, offset, size all left null + } + + // info (papszInfo joined) — applies to all three states when non-null + if (info.papszInfo != nullptr) + { + CPLString osJoined; + for (int i = 0; info.papszInfo[i] != nullptr; ++i) + { + if (i > 0) + osJoined += "; "; + osJoined += info.papszInfo[i]; + } + poFeature->SetField(iInfoField, osJoined.c_str()); + } + + // Codec hoist to layer metadata on the first successful file-backed chunk. + if (!bCodecHoisted && info.papszInfo != nullptr && + info.pszFilename != nullptr) + { + for (int i = 0; info.papszInfo[i] != nullptr; ++i) + { + char *pszKey = nullptr; + const char *pszValue = + CPLParseNameValue(info.papszInfo[i], &pszKey); + if (pszKey && pszValue) + poLayer->SetMetadataItem( + CPLString().Printf("CODEC_%s", pszKey).c_str(), + pszValue); + CPLFree(pszKey); + } + bCodecHoisted = true; + } + + if (poLayer->CreateFeature(poFeature) != OGRERR_NONE) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot write feature for chunk %zu", iLinear); + OGRFeature::DestroyFeature(poFeature); + return false; + } + OGRFeature::DestroyFeature(poFeature); + + // progress + if (pfnProgress && (iLinear % nProgressInterval == 0)) + { + const double dfFraction = static_cast(iLinear) / + static_cast(nTotalChunks); + if (!pfnProgress(dfFraction, nullptr, pProgressData)) + { + ReportError(CE_Failure, CPLE_UserInterrupt, + "User interrupted at chunk %zu of %zu", iLinear, + nTotalChunks); + return false; + } + } + } + // Final progress tick + if (pfnProgress) + pfnProgress(1.0, nullptr, pProgressData); + + m_outputDataset.Set(std::move(poDstDS)); + + return true; +} + +//! @cond Doxygen_Suppress diff --git a/apps/gdalalg_mdim_get_refs.h b/apps/gdalalg_mdim_get_refs.h new file mode 100644 index 000000000000..1298a0e67a34 --- /dev/null +++ b/apps/gdalalg_mdim_get_refs.h @@ -0,0 +1,50 @@ +/****************************************************************************** + * + * Project: GDAL + * Purpose: gdal "mdim get-refs" subcommand + * Author: Michael Sumner + * + ****************************************************************************** + * Copyright (c) 2026, Michael Sumner + * + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#ifndef GDALALG_MDIM_GET_REFS_INCLUDED +#define GDALALG_MDIM_GET_REFS_INCLUDED + +#include "gdalalgorithm.h" + +//! @cond Doxygen_Suppress + +/************************************************************************/ +/* GDALMdimGetRefsAlgorithm */ +/************************************************************************/ + +class GDALMdimGetRefsAlgorithm final : public GDALAlgorithm +{ + public: + static constexpr const char *NAME = "get-refs"; + static constexpr const char *DESCRIPTION = + "Return byte references from a multidimensional raster source as " + "vector/table layer."; + static constexpr const char *HELP_URL = "/programs/gdal_mdim_get_refs.html"; + + explicit GDALMdimGetRefsAlgorithm(); + + private: + bool RunImpl(GDALProgressFunc pfnProgress, void *pProgressData) override; + + std::string m_outputFormat{}; + GDALArgDatasetValue m_inputDataset{}; + std::vector m_openOptions{}; + std::vector m_inputFormats{}; + std::string m_array{}; + GDALArgDatasetValue m_outputDataset{}; + bool m_overwrite = false; + std::vector m_creationOptions{}; +}; + +//! @endcond + +#endif diff --git a/apps/gdalalg_mdim_get_refs_common.h b/apps/gdalalg_mdim_get_refs_common.h new file mode 100644 index 000000000000..da7df7730a5b --- /dev/null +++ b/apps/gdalalg_mdim_get_refs_common.h @@ -0,0 +1,125 @@ +/****************************************************************************** + * + * Project: GDAL + * Purpose: Shared helpers for "gdal mdim get-refs" + * Author: Michael Sumner + * + ****************************************************************************** + * Copyright (c) 2026, Michael Sumner + * + * SPDX-License-Identifier: MIT + ****************************************************************************/ +#ifndef GDALALG_MDIM_GET_REFS_COMMON_INCLUDED +#define GDALALG_MDIM_GET_REFS_COMMON_INCLUDED + +#include +#include +#include + +#include "cpl_error.h" + +//! @cond Doxygen_Suppress + +namespace get_refs +{ + +/************************************************************************/ +/* LinearToCoords() */ +/************************************************************************/ +/** + * Decode a linear chunk index into per-dimension chunk coordinates, + * using row-major (last dimension varies fastest) ordering. + * + * Given a chunk grid of shape n_chunks = [N_0, N_1, ..., N_{k-1}] and a + * linear index iLinear in [0, product(n_chunks)), this fills coords with + * the [c_0, c_1, ..., c_{k-1}] such that + * iLinear = (((c_0 * N_1) + c_1) * N_2 + c_2) * ... + c_{k-1}. + * + * The output is uint64_t to match GDALMDArray::GetRawBlockInfo()'s + * coordinate-pointer parameter directly — no further conversion needed + * at the call site. + * + * @param iLinear The flat chunk index. Must be < product(n_chunks); + * callers should bound this with nTotalChunks. No + * internal range check (this is on the hot path). + * @param n_chunks Per-dimension chunk count. Must be non-empty and have + * no zero entries. + * @param coords Output buffer; checked to match size of n_chunks, and filled. + */ +inline void LinearToCoords(size_t iLinear, const std::vector &n_chunks, + std::vector &coords) +{ + CPLAssert(coords.size() == n_chunks.size()); + size_t remaining = iLinear; + for (size_t iDim = n_chunks.size(); iDim-- > 0;) + { + coords[iDim] = static_cast(remaining % n_chunks[iDim]); + remaining /= n_chunks[iDim]; + } +} + +/************************************************************************/ +/* CoordsToLinear() */ +/************************************************************************/ +/** + * Encode per-dimension chunk coordinates into a linear chunk index, + * inverse of LinearToCoords(). + * + * Currently unused by RunImpl which only needs the decoder direction, + * this is the pair to LinearToCoords. + * + * @param coords Per-dimension chunk coordinates. Each coords[i] must + * be < n_chunks[i]; not checked. + * @param n_chunks Per-dimension chunk count. Same size as coords. + * @return The linear index in [0, product(n_chunks)). + */ +inline size_t CoordsToLinear(const std::vector &coords, + const std::vector &n_chunks) +{ + CPLAssert(coords.size() == n_chunks.size()); + size_t iLinear = 0; + for (size_t iDim = 0; iDim < n_chunks.size(); ++iDim) + { + iLinear = iLinear * n_chunks[iDim] + static_cast(coords[iDim]); + } + return iLinear; +} + +/************************************************************************/ +/* ComputeChunkGrid() */ +/************************************************************************/ +/** + * Compute per-dimension chunk counts by ceil-division: + * n_chunks[i] = (dim_size[i] + block[i] - 1) / block[i]. + * + * Confirmed correct against ZARR, netCDF, and HDF5 drivers (Phase 0 + * evidence log Q1; HDFEOS addendum). Caller must have already verified + * that all block[i] > 0; this function will divide by zero otherwise. + * + * @param dim_size Per-dimension array size. + * @param block Per-dimension block size. block.size() == dim_size.size(). + * @param n_chunks Output; resized to dim_size.size() and filled. + * @return The total chunk count, product(n_chunks). + */ +inline size_t ComputeChunkGrid(const std::vector &dim_size, + const std::vector &block, + std::vector &n_chunks) +{ + CPLAssert(dim_size.size() == block.size()); + n_chunks.resize(dim_size.size()); + size_t nTotal = 1; + for (size_t i = 0; i < dim_size.size(); ++i) + { + CPLAssert(block[i] > 0); + n_chunks[i] = + static_cast((dim_size[i] + block[i] - 1) / block[i]); + nTotal *= n_chunks[i]; + } + return nTotal; +} + +} // namespace get_refs + +//! @endcond + +#endif // GDALALG_MDIM_GET_REFS_COMMON_INCLUDED diff --git a/autotest/utilities/data/mdim_zarr.vrt b/autotest/utilities/data/mdim_zarr.vrt new file mode 100644 index 000000000000..eb3f9002364b --- /dev/null +++ b/autotest/utilities/data/mdim_zarr.vrt @@ -0,0 +1,32 @@ + + + + + + Float64 + + 20 + degrees_north + + + + Float64 + + 40 + degrees_east + + + + Float64 + + + 20,40 + + ../gdrivers/data/zarr/array_dimensions.zarr + /var + + + + + + diff --git a/autotest/utilities/test_gdalalg_mdim_get_refs.py b/autotest/utilities/test_gdalalg_mdim_get_refs.py new file mode 100644 index 000000000000..16616bd0a5b8 --- /dev/null +++ b/autotest/utilities/test_gdalalg_mdim_get_refs.py @@ -0,0 +1,321 @@ +#!/usr/bin/env pytest +# -*- coding: utf-8 -*- +############################################################################### +# Project: GDAL/OGR Test Suite +# Purpose: 'gdal mdim get-refs' testing +# Author: Michael Sumner +# +############################################################################### +# Copyright (c) 2026, Michael Sumner +# +# SPDX-License-Identifier: MIT +############################################################################### + +import pytest +import test_cli_utilities + +from osgeo import gdal, ogr + +pytestmark = [ + pytest.mark.require_driver("VRT"), + pytest.mark.require_driver("GPKG"), + pytest.mark.skipif( + test_cli_utilities.get_gdal_path() is None, + reason="gdal binary not available", + ), +] + + +@pytest.fixture() +def gdal_path(): + return test_cli_utilities.get_gdal_path() + + +def get_mdim_get_refs_alg(): + return gdal.GetGlobalAlgorithmRegistry()["mdim"]["get-refs"] + + +def get_mdim_convert_alg(): + return gdal.GetGlobalAlgorithmRegistry()["mdim"]["convert"] + + +############################################################################### +# Algorithm-level tests, arguments, errors, framework behaviour. +############################################################################### + +# Helper to extract type+subtype for a named field +def field_info(name, defn): + i = defn.GetFieldIndex(name) + assert i >= 0, f"field {name!r} missing from layer" + fd = defn.GetFieldDefn(i) + return fd.GetType(), fd.GetSubType() + + +def test_gdalalg_mdim_get_refs_basic(tmp_vsimem): + """Round-trip a small mdim input through get-refs; verify layer + schema.""" + tmpfile = tmp_vsimem / "out.gpkg" + alg = get_mdim_get_refs_alg() + alg["array"] = "/var" + alg["output-format"] = "GPKG" + assert alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) + ds = gdal.OpenEx(str(tmpfile), gdal.OF_VECTOR) + assert ds is not None + layer = ds.GetLayer(0) + assert layer is not None + defn = layer.GetLayerDefn() + + # dim_0, dim_1 (assuming a 2D array in the VRT) — Integer64, no subtype + assert field_info("dim_0", defn) == (ogr.OFTInteger64, ogr.OFSTNone) + assert field_info("dim_1", defn) == (ogr.OFTInteger64, ogr.OFSTNone) + + # present — Integer with Boolean subtype + assert field_info("present", defn) == (ogr.OFTInteger, ogr.OFSTBoolean) + + # path, info — plain String, no subtype + assert field_info("path", defn) == (ogr.OFTString, ogr.OFSTNone) + assert field_info("info", defn) == (ogr.OFTString, ogr.OFSTNone) + + # offset, size — Integer64, no subtype + assert field_info("offset", defn) == (ogr.OFTInteger64, ogr.OFSTNone) + assert field_info("size", defn) == (ogr.OFTInteger64, ogr.OFSTNone) + + +def test_gdalalg_mdim_get_refs_array_required(): + """--array is required at parse time; absence is rejected by the framework.""" + alg = get_mdim_get_refs_alg() + with pytest.raises(Exception, match="array"): + alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", "/tmp/dummy.gpkg"]) + + +def test_gdalalg_mdim_get_refs_missing_array_path(tmp_vsimem): + """An array name that doesn't exist must fail clearly with the suggestion.""" + tmpfile = tmp_vsimem / "out.gpkg" + alg = get_mdim_get_refs_alg() + alg["array"] = "/this_array_does_not_exist" + alg["output-format"] = "GPKG" + with pytest.raises(Exception, match="Cannot find array"): + alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) + + +def test_gdalalg_mdim_get_refs_unknown_output_format(tmp_vsimem): + """Unknown output driver must produce the helpful error wording.""" + alg = get_mdim_get_refs_alg() + alg["array"] = "/var" + with pytest.raises(RuntimeError, match="does not exist"): + alg["output-format"] = "NOT_A_DRIVER" + + +def test_gdalalg_mdim_get_refs_non_vector_output_format(tmp_vsimem): + """A raster driver passed as --of must fail (capability filter rejects).""" + alg = get_mdim_get_refs_alg() + alg["array"] = "/var" + with pytest.raises(RuntimeError, match="does not expose the required"): + alg["output-format"] = "GTiff" + + +def test_gdalalg_mdim_get_refs_overwrite(tmp_vsimem): + """--overwrite permits re-running over an existing output file.""" + tmpfile = tmp_vsimem / "out.gpkg" + + alg = get_mdim_get_refs_alg() + alg["array"] = "/var" + alg["output-format"] = "GPKG" + assert alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) + + # Second run without --overwrite should fail with the framework's wording. + alg2 = get_mdim_get_refs_alg() + alg2["array"] = "/var" + alg2["output-format"] = "GPKG" + with pytest.raises(Exception, match="already exists"): + alg2.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) + + # Third run with --overwrite should succeed. + alg3 = get_mdim_get_refs_alg() + alg3["array"] = "/var" + alg3["output-format"] = "GPKG" + alg3["overwrite"] = True + assert alg3.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) + + +def test_gdalalg_mdim_get_refs_no_chunked_storage(tmp_vsimem): + """Array without natural block size declines cleanly (Stage B3 guard).""" + # The mdim.vrt fixture should contain (or be augmented to contain) a + # coordinate array or contiguous-storage variable that GetBlockSize + # returns 0 for. Adjust the array path once the fixture is finalised. + tmpfile = tmp_vsimem / "out.gpkg" + alg = get_mdim_get_refs_alg() + alg["array"] = "/my_variable_with_time_decreasing" + alg["output-format"] = "GPKG" + with pytest.raises(Exception, match="not chunk-enumerable"): + alg.ParseRunAndFinalize(["data/mdim.vrt", str(tmpfile)]) + + +@pytest.mark.require_driver("HDF5") +def test_gdalalg_mdim_get_refs_hdf5_partial_chunk(tmp_vsimem): + """HDF5 with non-even chunking: trailing partial chunk must appear. + + Uses the autotest HDFEOS fixture (dummy_HDFEOS_swath_chunked.h5), + which has dimensions [20, 30, 40] and chunks [3, 4, 6] -- all dimensions + non-even, so the chunk grid is [7, 8, 7] = 392 chunks total. + + The trailing all-partial corner chunk (6, 7, 6) is the Phase 0 Q1 + stressor: its byte size of 64 (versus ~143 for full chunks) confirms + ceil-division semantics. Both the offset (121797) and size (64) here + are values predicted by the Phase 0 evidence-log oracle and verified + against the running algorithm in earlier validation. + """ + tmpfile = tmp_vsimem / "out.gpkg" + alg = get_mdim_get_refs_alg() + alg["array"] = "/HDFEOS/SWATHS/MySwath/Data Fields/MyDataField" + alg["output-format"] = "GPKG" + assert alg.ParseRunAndFinalize( + ["../gdrivers/data/hdf5/dummy_HDFEOS_swath_chunked.h5", str(tmpfile)] + ) + + ds = gdal.OpenEx(str(tmpfile), gdal.OF_VECTOR) + layer = ds.GetLayer(0) + assert layer.GetFeatureCount() == 392 # 7 * 8 * 7 from ceil division + + # row (6, 7, 6) is a present, all-partial corner chunk with the specific offset/size. + layer.SetAttributeFilter("dim_0 = 6 AND dim_1 = 7 AND dim_2 = 6") + f = layer.GetNextFeature() + assert f is not None + assert f.GetField("present") == 1 + assert f.GetField("offset") == 121797 + assert f.GetField("size") == 64 + + +############################################################################### +# Zarr driver tests: exercise present (one-file-per-chunk) and absent (sparse) +# branches of the three-state classification. Both require a small synthetic +# fixture built with zarr-python; deferred to a later test pass if zarr-python +# is not available in the test environment. +############################################################################### + + +@pytest.mark.require_driver("ZARR") +def test_gdalalg_mdim_get_refs_zarr_present(tmp_vsimem): + """Native Zarr produces present rows with offset=0 (one file per chunk). + + Uses autotest fixture gdrivers/data/zarr/order_f_u2.zarr: 2D array [4, 4] + with chunks [2, 3], ceil grid [2, 2], all four chunks materialised. The + non-even chunking on dim_1 (4 / 3 = 2 with remainder) also exercises + ceil-division through the Zarr driver, complementing the HDF5 partial-chunk + test in case the per-driver implementations diverge. + + Verifies: every chunk reports present=1, offset=0 (Zarr one-file-per-chunk + pattern), with path ending in the Zarr chunk-key form (e.g. ".../0.0"). + """ + tmpfile = tmp_vsimem / "out.gpkg" + alg = get_mdim_get_refs_alg() + alg["array"] = "/order_f_u2" + alg["output-format"] = "GPKG" + assert alg.ParseRunAndFinalize( + ["../gdrivers/data/zarr/order_f_u2.zarr", str(tmpfile)] + ) + + ds = gdal.OpenEx(str(tmpfile), gdal.OF_VECTOR) + layer = ds.GetLayer(0) + assert layer.GetFeatureCount() == 4 # 2 * 2 from ceil([4/2, 4/3]) + + # All chunks must be present, all offsets zero (Zarr's one-file-per-chunk + # pattern), all paths must reference the chunk-key form ".../d0.d1". + layer.SetAttributeFilter("present = 1") + assert layer.GetFeatureCount() == 4 + layer.SetAttributeFilter("offset = 0") + assert layer.GetFeatureCount() == 4 + layer.SetAttributeFilter(None) # clear filter for enumeration check below + + # Enumeration cross-check: every (dim_0, dim_1) in the chunk grid must + # appear exactly once. This implicitly tests the row-major enumeration + # order of LinearToCoords (no chunks dropped, no chunks duplicated). + expected_coords = [(0, 0), (0, 1), (1, 0), (1, 1)] + for d0, d1 in expected_coords: + layer.SetAttributeFilter(f"dim_0 = {d0} AND dim_1 = {d1}") + assert ( + layer.GetFeatureCount() == 1 + ), f"chunk coordinate ({d0}, {d1}) missing from output" + + # Path-form check: pick one chunk and confirm the path ends with the + # expected Zarr chunk-key. The full path includes the fixture's location; + # we only assert on the suffix. + layer.SetAttributeFilter("dim_0 = 1 AND dim_1 = 1") + f = layer.GetNextFeature() + assert f is not None + assert f.GetField("path").endswith( + "/1.1" + ), f"unexpected path form: {f.GetField('path')}" + + +@pytest.mark.require_driver("ZARR") +def test_gdalalg_mdim_get_refs_zarr_sparse(tmp_path, tmp_vsimem): + """Zarr with a missing chunk produces an absent row (NULL path/offset/size). + + Copies the order_f_u2.zarr fixture into tmp_path, deletes chunk file 0.1 + to create a sparse case (Zarr semantics: missing chunk = fill-value + everywhere), and verifies that get-refs emits a present=0 row with NULL + path/offset/size for that coordinate while the other three chunks remain + present. + + This exercises the absent branch of the three-state classification; the + present branch is covered by test_gdalalg_mdim_get_refs_zarr_present. + """ + import os + + alg = get_mdim_convert_alg() + dst = str(tmp_path / "order_f_u2_sparse.zarr") + assert alg.ParseRunAndFinalize(["../gdrivers/data/zarr/order_f_u2.zarr", dst]) + + # Remove one chunk file. Picking 0.1 (a non-corner chunk) so the test + # exercises absence cleanly without conflating with the trailing-partial + # case at coordinate (1, 1). + os.remove(os.path.join(dst, "order_f_u2", "0.1")) + + tmpfile = tmp_vsimem / "out.gpkg" + alg = get_mdim_get_refs_alg() + alg["array"] = "/order_f_u2" + alg["output-format"] = "GPKG" + assert alg.ParseRunAndFinalize([dst, str(tmpfile)]) + + ds = gdal.OpenEx(str(tmpfile), gdal.OF_VECTOR) + layer = ds.GetLayer(0) + assert layer.GetFeatureCount() == 4 # the chunk grid is unchanged + + # The deleted chunk's row must be absent with NULL byte-reference fields. + layer.SetAttributeFilter("dim_0 = 0 AND dim_1 = 1") + f = layer.GetNextFeature() + assert f is not None + assert f.GetField("present") == 0 + assert f.IsFieldNull("path") + assert f.IsFieldNull("offset") + assert f.IsFieldNull("size") + + # The other three chunks must still be present. + layer.SetAttributeFilter("present = 1") + assert layer.GetFeatureCount() == 3 + + # And specifically, only chunk (0, 1) should be absent. + layer.SetAttributeFilter("present = 0") + assert layer.GetFeatureCount() == 1 + + +############################################################################### +# Parquet-specific tests +############################################################################### + + +@pytest.mark.require_driver("Parquet") +def test_gdalalg_mdim_get_refs_parquet_boolean_subtype(tmp_vsimem): + """OFSTBoolean subtype survives Parquet round-trip.""" + tmpfile = tmp_vsimem / "out.parquet" + alg = get_mdim_get_refs_alg() + alg["array"] = "/var" + alg["output-format"] = "Parquet" + assert alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) + + ds = gdal.OpenEx(str(tmpfile), gdal.OF_VECTOR) + layer = ds.GetLayer(0) + defn = layer.GetLayerDefn() + idx = defn.GetFieldIndex("present") + assert idx >= 0 + assert defn.GetFieldDefn(idx).GetSubType() == ogr.OFSTBoolean diff --git a/doc/source/programs/gdal_mdim_get_refs.rst b/doc/source/programs/gdal_mdim_get_refs.rst new file mode 100644 index 000000000000..061d0001be61 --- /dev/null +++ b/doc/source/programs/gdal_mdim_get_refs.rst @@ -0,0 +1,190 @@ +.. _gdal_mdim_get_refs: + +================================================================================ +``gdal mdim get-refs`` +================================================================================ + +.. versionadded:: 3.14 + +.. program:: gdal mdim get-refs + +.. only:: html + + Extract per-chunk byte references from a multidimensional raster array + into a vector layer. + +.. Index:: gdal mdim get-refs + +Synopsis +-------- + +.. program-output:: gdal mdim get-refs --help-doc + +Description +----------- + +:program:`gdal mdim get-refs` enumerates the chunks of a multidimensional +array and writes one feature per chunk into a vector layer. Each feature +records the chunk's coordinates within the array's chunk grid, the location +of the chunk's backing storage (file path, byte offset, and byte size), and +any per-chunk metadata reported by the driver. + +The output is an attribute-only vector layer (no geometry). + +The algorithm is format-general: it relies on the multidimensional +``GetRawBlockInfo()``. + +Three-state classification +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each chunk is classified as one of three states, reflected in the ``present`` +field of the output layer: + +* **Present** (``present=1``, ``path``/``offset``/``size`` populated): the + chunk is file-backed and can be read from the given byte range. +* **Absent** (``present=0``, ``path``/``offset``/``size`` NULL): the chunk + is not stored (sparse Zarr, missing chunks). Consumers should substitute + the array's fill value. +* **Inline** (``present=1``, ``path``/``offset`` NULL, ``size`` populated): + the chunk data is small enough to be embedded in the storage metadata + rather than referenced as bytes. (Not yet implemented.) + +Schema +------ + +The output layer's schema is determined by the input array's rank: + +.. list-table:: + :header-rows: 1 + + * - Field name + - Type + - Description + * - ``dim_0`` .. ``dim_{n-1}`` + - Integer64 + - Chunk coordinate per dimension of the input array, in row-major + order. The dimension *names* are not used as column names directly + (to avoid sanitization concerns); they are recorded in the layer + metadata as ``DIM_N_NAME``. + * - ``present`` + - Integer (Boolean subtype) + - 1 if the chunk has storage (file-backed or inline); 0 if absent. + * - ``path`` + - String, nullable + - For present chunks, the path to the backing file (including any + ``/vsicurl/`` or other VSI prefix). NULL for absent and inline + chunks. + * - ``offset`` + - Integer64, nullable + - For file-backed present chunks, the byte offset within the file + where the chunk's raw bytes start. For native Zarr (one file per + chunk), this is typically 0. NULL for absent and inline chunks. Note + that ``offset`` is a SQL reserved word and must be quoted in queries. + * - ``size`` + - Integer64, nullable + - For present chunks, the number of bytes of raw chunk storage. + NULL for absent chunks. + * - ``info`` + - String, nullable + - Per-chunk metadata reported by the driver, joined as ``KEY=VALUE`` + pairs separated by ``; ``. Typically describes compression and + byte-order. The same codec information is also hoisted to the layer + as metadata items ``CODEC_*``. + +Layer metadata +~~~~~~~~~~~~~~ + +The output layer carries the following metadata items, accessible via +:cpp:func:`GDALMajorObject::GetMetadata` (``-mdd`` in :program:`ogrinfo`): + +* ``ARRAY_NAME`` — the fullname of the input array +* ``DTYPE`` — the array's data type (e.g. ``Int16``, ``Float32``) +* ``DIM_N_NAME``, ``DIM_N_SIZE``, ``DIM_N_BLOCK``, ``DIM_N_CHUNKS`` — for + each dimension, the dimension name, size, block size, and number of + chunks along that axis +* ``CODEC_*`` — array-level codec metadata hoisted from the first chunk + (e.g. ``CODEC_COMPRESSION=DEFLATE``, ``CODEC_FILTER=SHUFFLE``) + +Limitations +----------- + +* Only a single array can be emitted (``--array`` is required). +* Arrays without natural block size decline with a ``not chunk-enumerable`` error. +* The inline data payload is not extracted as a binary field. +* No geometry column is emitted. +* ``GetRawBlockInfo()`` iterates chunks and this can be slow especially for remote sources. + +Options +------- + +.. include:: options/gdal_options/of_vector.rst + +.. option:: --array + + Required. Fullname of the multidimensional array within the input + dataset (e.g. ``/temp`` or ``/HDFEOS/SWATHS/MySwath/Data Fields/MyDataField``). + Use :program:`gdal mdim info` to list available arrays in an input + dataset. + +.. include:: options/gdal_options/oo.rst + +.. include:: options/gdal_options/if_multidim_raster.rst + +.. include:: options/gdal_options/co_vector.rst + +.. include:: options/gdal_options/overwrite.rst + + +Examples +-------- + +.. example:: + :title: Extract chunk references from a chunked HDF5 array to GeoPackage + + .. code-block:: bash + + gdal mdim get-refs \ + --array "/HDFEOS/SWATHS/MySwath/Data Fields/MyDataField" \ + input.h5 chunks.gpkg + +.. example:: + :title: Extract from a remote netCDF, output as Parquet + + .. code-block:: bash + + gdal mdim get-refs \ + --array /temp --of Parquet \ + /vsicurl/https://example.org/path/to/ocean.nc \ + ocean_chunks.parquet + +.. example:: + :title: Query the resulting Parquet for specific chunk coordinates + + The output participates in SQL pushdown. The ``offset`` column must be + quoted as it is a reserved word. + + .. code-block:: bash + + ogrinfo ocean_chunks.parquet -sql \ + 'SELECT "offset", size FROM ocean_chunks + WHERE dim_2 BETWEEN 600 AND 900 + AND dim_3 BETWEEN 1200 AND 1500' + +.. example:: + :title: Native Zarr (one file per chunk; offset is always 0) + + .. code-block:: bash + + gdal mdim get-refs \ + --array /adt --of GPKG \ + 'ZARR:"/vsicurl/https://example.org/store.zarr"' \ + adt_chunks.gpkg + +See Also +-------- + +* :ref:`gdal_mdim_info` — discover the arrays and chunk geometry of a + multidimensional dataset +* :ref:`gdal_mdim_convert` — copy or transform a multidimensional dataset +* :ref:`gdal_mdim_mosaic` — compose multiple multidimensional datasets into + one virtual view From a68f158a4de8ae89318a131163c6fc0109e29173 Mon Sep 17 00:00:00 2001 From: Michael Sumner Date: Thu, 28 May 2026 05:21:30 +0000 Subject: [PATCH 2/2] apply review changes (some) --- apps/gdalalg_mdim_get_refs.cpp | 175 ++++++------------ apps/gdalalg_mdim_get_refs_common.h | 41 ++-- .../utilities/test_gdalalg_mdim_get_refs.py | 100 ++-------- doc/source/programs/gdal_mdim_get_refs.rst | 24 ++- 4 files changed, 109 insertions(+), 231 deletions(-) diff --git a/apps/gdalalg_mdim_get_refs.cpp b/apps/gdalalg_mdim_get_refs.cpp index b9d8520e3e17..1ee2123cc23f 100644 --- a/apps/gdalalg_mdim_get_refs.cpp +++ b/apps/gdalalg_mdim_get_refs.cpp @@ -12,9 +12,11 @@ #include "gdalalg_mdim_get_refs.h" #include "cpl_conv.h" +#include "cpl_string.h" #include "gdal_priv.h" #include "ogrsf_frmts.h" #include "gdalalg_mdim_get_refs_common.h" +#include //! @cond Doxygen_Suppress @@ -22,27 +24,8 @@ #define _(x) (x) #endif -namespace -{ - -// Format a vector of integers as "[a, b, c]" for debug messages. -std::string FormatVec(const std::vector &v) -{ - std::string s = "["; - for (size_t i = 0; i < v.size(); ++i) - { - if (i > 0) - s += ", "; - s += std::to_string(v[i]); - } - s += "]"; - return s; -} - -} // namespace - /************************************************************************/ -/* GDALMdimGetRefsAlgorithm() */ +/* GDALMdimGetRefsAlgorithm() */ /************************************************************************/ GDALMdimGetRefsAlgorithm::GDALMdimGetRefsAlgorithm() @@ -52,8 +35,7 @@ GDALMdimGetRefsAlgorithm::GDALMdimGetRefsAlgorithm() AddOutputFormatArg(&m_outputFormat, /* bStreamAllowed = */ false, /* bGDALGAllowed = */ false) .AddMetadataItem(GAAMDI_REQUIRED_CAPABILITIES, - {GDAL_DCAP_VECTOR, GDAL_DCAP_CREATE}) - .SetRequired(); + {GDAL_DCAP_VECTOR, GDAL_DCAP_CREATE}); AddOpenOptionsArg(&m_openOptions); AddInputFormatsArg(&m_inputFormats) .AddMetadataItem(GAAMDI_REQUIRED_CAPABILITIES, @@ -72,22 +54,10 @@ GDALMdimGetRefsAlgorithm::GDALMdimGetRefsAlgorithm() bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, void *pProgressData) { - // ---------------------------------------------------------------------- - // STAGE A — resolve the input array - // ---------------------------------------------------------------------- - // A1. Get the input GDALDataset from m_inputDataset (already opened by the - // framework as OF_MULTIDIM_RASTER — confirm footprint/convert rely on - // the framework open rather than re-opening). GetDatasetRef(). - auto poSrcDS = m_inputDataset.GetDatasetRef(); CPLAssert(poSrcDS); - // A2. GetRootGroup(). Null root => driver lacks mdim support => fail with a - // clear CPLError, return false. auto poRootGroup = poSrcDS->GetRootGroup(); - CPLDebug("MDIM-GET-REFS", "input: %s, root group: %s", - poSrcDS->GetDescription(), poRootGroup ? "present" : "NULL"); - if (!poRootGroup) { ReportError(CE_Failure, CPLE_AppDefined, @@ -107,21 +77,22 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, return false; } - const std::vector> apoDims = + const std::vector> &apoDims = poArray->GetDimensions(); const auto anBlockSize = poArray->GetBlockSize(); std::vector anBlockSizeU64(anBlockSize.begin(), anBlockSize.end()); CPLAssert(anBlockSize.size() == poArray->GetDimensionCount()); - for (size_t i = 0; i < anBlockSizeU64.size(); ++i) + for (size_t i = 0; i < anBlockSize.size(); ++i) { if (anBlockSize[i] == 0) { - ReportError(CE_Failure, CPLE_AppDefined, - "Array %s has no natural block size on dimension %zu; " - "not chunk-enumerable", - m_array.c_str(), i); + ReportError( + CE_Failure, CPLE_AppDefined, + "Array %s has no natural block size on dimension %" PRIu64 ";" + "not chunk-enumerable", + m_array.c_str(), static_cast(i)); return false; } } @@ -143,20 +114,29 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, for (size_t i = 0; i < apoDims.size(); ++i) anDimSize[i] = apoDims[i]->GetSize(); - // Stage C: chunk grid via the helper std::vector n_chunks; - const size_t nTotalChunks = + const uint64_t nTotalChunks = get_refs::ComputeChunkGrid(anDimSize, anBlockSizeU64, n_chunks); - CPLDebug( - "MDIM-GET-REFS", - "array %s: dims=[%s], blocks=[%s], chunks=[%s], total=%zu, dtype=%s", - m_array.c_str(), FormatVec(anDimSize).c_str(), - FormatVec(anBlockSizeU64).c_str(), FormatVec(n_chunks).c_str(), - nTotalChunks, dt_name); - const std::string osOutputPath = m_outputDataset.GetName(); + if (m_outputFormat.empty()) + { + const auto aosFormats = + CPLStringList(GDALGetOutputDriversForDatasetName( + m_outputDataset.GetName().c_str(), GDAL_OF_VECTOR, + /* bSingleMatch = */ true, + /* bWarn = */ true)); + if (aosFormats.size() != 1) + { + ReportError(CE_Failure, CPLE_AppDefined, + "Cannot guess driver for %s", + m_outputDataset.GetName().c_str()); + return false; + } + m_outputFormat = aosFormats[0]; + } + GDALDriver *poDriver = GetGDALDriverManager()->GetDriverByName(m_outputFormat.c_str()); if (!poDriver) @@ -190,10 +170,8 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, osLayerName.c_str(), osOutputPath.c_str()); return false; } - - // D6. Per-dimension fields: dim_0, dim_1, ... as OFTInteger64 - // (names live in layer metadata, not field names — keeps the schema - // identical-shape across arrays, sidesteps sanitization). + // Per-dimension fields: dim_0, dim_1, ... with generic names + // for row-filter, rather than an encoded position "0.0.0.." for (size_t i = 0; i < apoDims.size(); ++i) { OGRFieldDefn oField(CPLSPrintf("dim_%zu", i), OFTInteger64); @@ -205,7 +183,6 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, } } - // D7. Generic fields per RFC Stage 1 schema. OGRFieldDefn oPresentField("present", OFTInteger); oPresentField.SetSubType(OFSTBoolean); if (poLayer->CreateField(&oPresentField) != OGRERR_NONE) @@ -216,7 +193,7 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, } OGRFieldDefn oPathField("path", OFTString); - oPathField.SetNullable(TRUE); + if (poLayer->CreateField(&oPathField) != OGRERR_NONE) { ReportError(CE_Failure, CPLE_AppDefined, "Cannot create field 'path'"); @@ -224,7 +201,7 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, } OGRFieldDefn oOffsetField("offset", OFTInteger64); - oOffsetField.SetNullable(TRUE); + if (poLayer->CreateField(&oOffsetField) != OGRERR_NONE) { ReportError(CE_Failure, CPLE_AppDefined, @@ -233,7 +210,6 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, } OGRFieldDefn oSizeField("size", OFTInteger64); - oSizeField.SetNullable(TRUE); if (poLayer->CreateField(&oSizeField) != OGRERR_NONE) { ReportError(CE_Failure, CPLE_AppDefined, "Cannot create field 'size'"); @@ -241,16 +217,14 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, } OGRFieldDefn oInfoField("info", OFTString); - oInfoField.SetNullable(TRUE); if (poLayer->CreateField(&oInfoField) != OGRERR_NONE) { ReportError(CE_Failure, CPLE_AppDefined, "Cannot create field 'info'"); return false; } - // D8. Array-level metadata on the layer. poLayer->SetMetadataItem("ARRAY_NAME", m_array.c_str()); - poLayer->SetMetadataItem("DTYPE", dt_name); + poLayer->SetMetadataItem("DATA_TYPE", dt_name); for (size_t i = 0; i < apoDims.size(); ++i) { poLayer->SetMetadataItem(CPLSPrintf("DIM_%zu_NAME", i), @@ -266,18 +240,12 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, CPLSPrintf("%zu", n_chunks[i])); } - CPLDebug("MDIM-GET-REFS", - "created layer '%s' with %d fields, ready for %zu features", - osLayerName.c_str(), poLayer->GetLayerDefn()->GetFieldCount(), - nTotalChunks); - - std::vector coords( - apoDims.size()); // reused, inside LinearToCoords + std::vector coords(apoDims.size()); GDALMDArrayRawBlockInfo info; // reused, .clear() per iteration // Loop over chunks const size_t nProgressInterval = std::max(1, nTotalChunks / 100); - bool bCodecHoisted = false; + bool bCodecToLayerMetadata = false; for (size_t iLinear = 0; iLinear < nTotalChunks; ++iLinear) { info.clear(); @@ -288,94 +256,69 @@ bool GDALMdimGetRefsAlgorithm::RunImpl(GDALProgressFunc pfnProgress, "GetRawBlockInfo failed at linear index %zu", iLinear); return false; } - if (iLinear < 3 || iLinear == nTotalChunks - 1) - { - CPLDebug("MDIM-GET-REFS", - "chunk %zu coords=[%s] path=%s offset=" CPL_FRMT_GUIB - " size=" CPL_FRMT_GUIB, - iLinear, FormatVec(coords).c_str(), - info.pszFilename ? info.pszFilename : "(null)", - static_cast(info.nOffset), - static_cast(info.nSize)); - } - OGRFeature *poFeature = - OGRFeature::CreateFeature(poLayer->GetLayerDefn()); - + OGRFeature oFeature(poLayer->GetLayerDefn()); // Per-dim coordinates: dim_0 .. dim_{n-1} for (size_t i = 0; i < coords.size(); ++i) - poFeature->SetField(static_cast(i), - static_cast(coords[i])); - - // Three-state classification — exactly the Commit 1 reconciliation + oFeature.SetField(static_cast(i), + static_cast(coords[i])); const int iPresentField = static_cast(coords.size()); const int iPathField = iPresentField + 1; const int iOffsetField = iPresentField + 2; const int iSizeField = iPresentField + 3; const int iInfoField = iPresentField + 4; + // Three-state classification if (info.pszFilename != nullptr) { // present (file-backed) - poFeature->SetField(iPresentField, 1); - poFeature->SetField(iPathField, info.pszFilename); - poFeature->SetField(iOffsetField, - static_cast(info.nOffset)); - poFeature->SetField(iSizeField, static_cast(info.nSize)); + oFeature.SetField(iPresentField, 1); + oFeature.SetField(iPathField, info.pszFilename); + oFeature.SetField(iOffsetField, static_cast(info.nOffset)); + oFeature.SetField(iSizeField, static_cast(info.nSize)); } else if (info.pabyInlineData != nullptr) { - // inline — Stage 1 reports size but not bytes - // (note: classify by pabyInlineData, not nSize > 0 — Commit 1) - poFeature->SetField(iPresentField, 1); - // path and offset left null (default state) - poFeature->SetField(iSizeField, static_cast(info.nSize)); + // path and offset left null, and we don't yet inline data even though it's available + oFeature.SetField(iPresentField, 1); + oFeature.SetField(iSizeField, static_cast(info.nSize)); } else { // absent (sparse) - poFeature->SetField(iPresentField, 0); - // path, offset, size all left null + oFeature.SetField(iPresentField, 0); } - - // info (papszInfo joined) — applies to all three states when non-null if (info.papszInfo != nullptr) { - CPLString osJoined; + std::string osJoined; for (int i = 0; info.papszInfo[i] != nullptr; ++i) { if (i > 0) osJoined += "; "; osJoined += info.papszInfo[i]; } - poFeature->SetField(iInfoField, osJoined.c_str()); + oFeature.SetField(iInfoField, osJoined.c_str()); } - // Codec hoist to layer metadata on the first successful file-backed chunk. - if (!bCodecHoisted && info.papszInfo != nullptr && + // Codec layer metadata on the first successful file-backed chunk. + if (!bCodecToLayerMetadata && info.papszInfo != nullptr && info.pszFilename != nullptr) { - for (int i = 0; info.papszInfo[i] != nullptr; ++i) + const CPLStringList aosInfo(info.papszInfo, /* bAssign = */ false); + for (const auto &[pszKey, pszValue] : + cpl::IterateNameValue(aosInfo)) { - char *pszKey = nullptr; - const char *pszValue = - CPLParseNameValue(info.papszInfo[i], &pszKey); - if (pszKey && pszValue) - poLayer->SetMetadataItem( - CPLString().Printf("CODEC_%s", pszKey).c_str(), - pszValue); - CPLFree(pszKey); + poLayer->SetMetadataItem( + CPLString().Printf("CODEC_%s", pszKey).c_str(), pszValue); } - bCodecHoisted = true; + bCodecToLayerMetadata = true; } - if (poLayer->CreateFeature(poFeature) != OGRERR_NONE) + if (poLayer->CreateFeature(&oFeature) != OGRERR_NONE) { ReportError(CE_Failure, CPLE_AppDefined, "Cannot write feature for chunk %zu", iLinear); - OGRFeature::DestroyFeature(poFeature); return false; } - OGRFeature::DestroyFeature(poFeature); // progress if (pfnProgress && (iLinear % nProgressInterval == 0)) diff --git a/apps/gdalalg_mdim_get_refs_common.h b/apps/gdalalg_mdim_get_refs_common.h index da7df7730a5b..5c9b1a282f7a 100644 --- a/apps/gdalalg_mdim_get_refs_common.h +++ b/apps/gdalalg_mdim_get_refs_common.h @@ -24,7 +24,7 @@ namespace get_refs { /************************************************************************/ -/* LinearToCoords() */ +/* LinearToCoords() */ /************************************************************************/ /** * Decode a linear chunk index into per-dimension chunk coordinates, @@ -36,7 +36,7 @@ namespace get_refs * iLinear = (((c_0 * N_1) + c_1) * N_2 + c_2) * ... + c_{k-1}. * * The output is uint64_t to match GDALMDArray::GetRawBlockInfo()'s - * coordinate-pointer parameter directly — no further conversion needed + * coordinate-pointer parameter directly - no further conversion needed * at the call site. * * @param iLinear The flat chunk index. Must be < product(n_chunks); @@ -46,20 +46,22 @@ namespace get_refs * no zero entries. * @param coords Output buffer; checked to match size of n_chunks, and filled. */ -inline void LinearToCoords(size_t iLinear, const std::vector &n_chunks, +inline void LinearToCoords(uint64_t iLinear, + const std::vector &n_chunks, std::vector &coords) { CPLAssert(coords.size() == n_chunks.size()); - size_t remaining = iLinear; - for (size_t iDim = n_chunks.size(); iDim-- > 0;) + uint64_t remaining = iLinear; + for (uint64_t iDim = n_chunks.size(); iDim > 0; /* decremented in loop */) { + --iDim; coords[iDim] = static_cast(remaining % n_chunks[iDim]); remaining /= n_chunks[iDim]; } } /************************************************************************/ -/* CoordsToLinear() */ +/* CoordsToLinear() */ /************************************************************************/ /** * Encode per-dimension chunk coordinates into a linear chunk index, @@ -73,14 +75,14 @@ inline void LinearToCoords(size_t iLinear, const std::vector &n_chunks, * @param n_chunks Per-dimension chunk count. Same size as coords. * @return The linear index in [0, product(n_chunks)). */ -inline size_t CoordsToLinear(const std::vector &coords, - const std::vector &n_chunks) +inline uint64_t CoordsToLinear(const std::vector &coords, + const std::vector &n_chunks) { CPLAssert(coords.size() == n_chunks.size()); - size_t iLinear = 0; - for (size_t iDim = 0; iDim < n_chunks.size(); ++iDim) + uint64_t iLinear = 0; + for (uint64_t iDim = 0; iDim < n_chunks.size(); ++iDim) { - iLinear = iLinear * n_chunks[iDim] + static_cast(coords[iDim]); + iLinear = iLinear * n_chunks[iDim] + coords[iDim]; } return iLinear; } @@ -92,27 +94,26 @@ inline size_t CoordsToLinear(const std::vector &coords, * Compute per-dimension chunk counts by ceil-division: * n_chunks[i] = (dim_size[i] + block[i] - 1) / block[i]. * - * Confirmed correct against ZARR, netCDF, and HDF5 drivers (Phase 0 - * evidence log Q1; HDFEOS addendum). Caller must have already verified - * that all block[i] > 0; this function will divide by zero otherwise. + * Confirmed correct against ZARR, netCDF, and HDF5 drivers. Caller must + * have already verified that all block[i] > 0; this function will divide + * by zero otherwise. * * @param dim_size Per-dimension array size. * @param block Per-dimension block size. block.size() == dim_size.size(). * @param n_chunks Output; resized to dim_size.size() and filled. * @return The total chunk count, product(n_chunks). */ -inline size_t ComputeChunkGrid(const std::vector &dim_size, - const std::vector &block, - std::vector &n_chunks) +inline uint64_t ComputeChunkGrid(const std::vector &dim_size, + const std::vector &block, + std::vector &n_chunks) { CPLAssert(dim_size.size() == block.size()); n_chunks.resize(dim_size.size()); - size_t nTotal = 1; + uint64_t nTotal = 1; for (size_t i = 0; i < dim_size.size(); ++i) { CPLAssert(block[i] > 0); - n_chunks[i] = - static_cast((dim_size[i] + block[i] - 1) / block[i]); + n_chunks[i] = cpl::div_round_up(dim_size[i], block[i]); nTotal *= n_chunks[i]; } return nTotal; diff --git a/autotest/utilities/test_gdalalg_mdim_get_refs.py b/autotest/utilities/test_gdalalg_mdim_get_refs.py index 16616bd0a5b8..3f7366de6c44 100644 --- a/autotest/utilities/test_gdalalg_mdim_get_refs.py +++ b/autotest/utilities/test_gdalalg_mdim_get_refs.py @@ -12,25 +12,15 @@ ############################################################################### import pytest -import test_cli_utilities from osgeo import gdal, ogr pytestmark = [ pytest.mark.require_driver("VRT"), pytest.mark.require_driver("GPKG"), - pytest.mark.skipif( - test_cli_utilities.get_gdal_path() is None, - reason="gdal binary not available", - ), ] -@pytest.fixture() -def gdal_path(): - return test_cli_utilities.get_gdal_path() - - def get_mdim_get_refs_alg(): return gdal.GetGlobalAlgorithmRegistry()["mdim"]["get-refs"] @@ -43,6 +33,7 @@ def get_mdim_convert_alg(): # Algorithm-level tests, arguments, errors, framework behaviour. ############################################################################### + # Helper to extract type+subtype for a named field def field_info(name, defn): i = defn.GetFieldIndex(name) @@ -64,27 +55,27 @@ def test_gdalalg_mdim_get_refs_basic(tmp_vsimem): assert layer is not None defn = layer.GetLayerDefn() - # dim_0, dim_1 (assuming a 2D array in the VRT) — Integer64, no subtype + # dim_0, dim_1 (assuming a 2D array in the VRT) - Integer64, no subtype assert field_info("dim_0", defn) == (ogr.OFTInteger64, ogr.OFSTNone) assert field_info("dim_1", defn) == (ogr.OFTInteger64, ogr.OFSTNone) - # present — Integer with Boolean subtype + # present - Integer with Boolean subtype assert field_info("present", defn) == (ogr.OFTInteger, ogr.OFSTBoolean) - # path, info — plain String, no subtype + # path, info - plain String, no subtype assert field_info("path", defn) == (ogr.OFTString, ogr.OFSTNone) assert field_info("info", defn) == (ogr.OFTString, ogr.OFSTNone) - # offset, size — Integer64, no subtype + # offset, size - Integer64, no subtype assert field_info("offset", defn) == (ogr.OFTInteger64, ogr.OFSTNone) assert field_info("size", defn) == (ogr.OFTInteger64, ogr.OFSTNone) -def test_gdalalg_mdim_get_refs_array_required(): +def test_gdalalg_mdim_get_refs_array_required(tmp_vsimem): """--array is required at parse time; absence is rejected by the framework.""" alg = get_mdim_get_refs_alg() with pytest.raises(Exception, match="array"): - alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", "/tmp/dummy.gpkg"]) + alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", tmp_vsimem / "dummy.gpkg"]) def test_gdalalg_mdim_get_refs_missing_array_path(tmp_vsimem): @@ -97,18 +88,16 @@ def test_gdalalg_mdim_get_refs_missing_array_path(tmp_vsimem): alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) -def test_gdalalg_mdim_get_refs_unknown_output_format(tmp_vsimem): +def test_gdalalg_mdim_get_refs_unknown_output_format(): """Unknown output driver must produce the helpful error wording.""" alg = get_mdim_get_refs_alg() - alg["array"] = "/var" with pytest.raises(RuntimeError, match="does not exist"): alg["output-format"] = "NOT_A_DRIVER" -def test_gdalalg_mdim_get_refs_non_vector_output_format(tmp_vsimem): +def test_gdalalg_mdim_get_refs_non_vector_output_format(): """A raster driver passed as --of must fail (capability filter rejects).""" alg = get_mdim_get_refs_alg() - alg["array"] = "/var" with pytest.raises(RuntimeError, match="does not expose the required"): alg["output-format"] = "GTiff" @@ -138,10 +127,7 @@ def test_gdalalg_mdim_get_refs_overwrite(tmp_vsimem): def test_gdalalg_mdim_get_refs_no_chunked_storage(tmp_vsimem): - """Array without natural block size declines cleanly (Stage B3 guard).""" - # The mdim.vrt fixture should contain (or be augmented to contain) a - # coordinate array or contiguous-storage variable that GetBlockSize - # returns 0 for. Adjust the array path once the fixture is finalised. + """Array without natural block size declines cleanly.""" tmpfile = tmp_vsimem / "out.gpkg" alg = get_mdim_get_refs_alg() alg["array"] = "/my_variable_with_time_decreasing" @@ -152,18 +138,7 @@ def test_gdalalg_mdim_get_refs_no_chunked_storage(tmp_vsimem): @pytest.mark.require_driver("HDF5") def test_gdalalg_mdim_get_refs_hdf5_partial_chunk(tmp_vsimem): - """HDF5 with non-even chunking: trailing partial chunk must appear. - - Uses the autotest HDFEOS fixture (dummy_HDFEOS_swath_chunked.h5), - which has dimensions [20, 30, 40] and chunks [3, 4, 6] -- all dimensions - non-even, so the chunk grid is [7, 8, 7] = 392 chunks total. - - The trailing all-partial corner chunk (6, 7, 6) is the Phase 0 Q1 - stressor: its byte size of 64 (versus ~143 for full chunks) confirms - ceil-division semantics. Both the offset (121797) and size (64) here - are values predicted by the Phase 0 evidence-log oracle and verified - against the running algorithm in earlier validation. - """ + """HDF5 with non-even chunking.""" tmpfile = tmp_vsimem / "out.gpkg" alg = get_mdim_get_refs_alg() alg["array"] = "/HDFEOS/SWATHS/MySwath/Data Fields/MyDataField" @@ -195,17 +170,7 @@ def test_gdalalg_mdim_get_refs_hdf5_partial_chunk(tmp_vsimem): @pytest.mark.require_driver("ZARR") def test_gdalalg_mdim_get_refs_zarr_present(tmp_vsimem): - """Native Zarr produces present rows with offset=0 (one file per chunk). - - Uses autotest fixture gdrivers/data/zarr/order_f_u2.zarr: 2D array [4, 4] - with chunks [2, 3], ceil grid [2, 2], all four chunks materialised. The - non-even chunking on dim_1 (4 / 3 = 2 with remainder) also exercises - ceil-division through the Zarr driver, complementing the HDF5 partial-chunk - test in case the per-driver implementations diverge. - - Verifies: every chunk reports present=1, offset=0 (Zarr one-file-per-chunk - pattern), with path ending in the Zarr chunk-key form (e.g. ".../0.0"). - """ + """Native Zarr produces present rows with offset=0 (one file per chunk).""" tmpfile = tmp_vsimem / "out.gpkg" alg = get_mdim_get_refs_alg() alg["array"] = "/order_f_u2" @@ -226,9 +191,8 @@ def test_gdalalg_mdim_get_refs_zarr_present(tmp_vsimem): assert layer.GetFeatureCount() == 4 layer.SetAttributeFilter(None) # clear filter for enumeration check below - # Enumeration cross-check: every (dim_0, dim_1) in the chunk grid must - # appear exactly once. This implicitly tests the row-major enumeration - # order of LinearToCoords (no chunks dropped, no chunks duplicated). + # Enumeration check: every (dim_0, dim_1) in the chunk grid must + # appear exactly once, no chunks dropped or duplicated. expected_coords = [(0, 0), (0, 1), (1, 0), (1, 1)] for d0, d1 in expected_coords: layer.SetAttributeFilter(f"dim_0 = {d0} AND dim_1 = {d1}") @@ -236,9 +200,8 @@ def test_gdalalg_mdim_get_refs_zarr_present(tmp_vsimem): layer.GetFeatureCount() == 1 ), f"chunk coordinate ({d0}, {d1}) missing from output" - # Path-form check: pick one chunk and confirm the path ends with the - # expected Zarr chunk-key. The full path includes the fixture's location; - # we only assert on the suffix. + # Path-form check: confirm the path ends with the chunk-key. The full + # path includes the fixture's location we only assert on the suffix. layer.SetAttributeFilter("dim_0 = 1 AND dim_1 = 1") f = layer.GetNextFeature() assert f is not None @@ -252,13 +215,9 @@ def test_gdalalg_mdim_get_refs_zarr_sparse(tmp_path, tmp_vsimem): """Zarr with a missing chunk produces an absent row (NULL path/offset/size). Copies the order_f_u2.zarr fixture into tmp_path, deletes chunk file 0.1 - to create a sparse case (Zarr semantics: missing chunk = fill-value - everywhere), and verifies that get-refs emits a present=0 row with NULL + to create a sparse case and verifies that get-refs emits a present=0 row with NULL path/offset/size for that coordinate while the other three chunks remain present. - - This exercises the absent branch of the three-state classification; the - present branch is covered by test_gdalalg_mdim_get_refs_zarr_present. """ import os @@ -267,8 +226,7 @@ def test_gdalalg_mdim_get_refs_zarr_sparse(tmp_path, tmp_vsimem): assert alg.ParseRunAndFinalize(["../gdrivers/data/zarr/order_f_u2.zarr", dst]) # Remove one chunk file. Picking 0.1 (a non-corner chunk) so the test - # exercises absence cleanly without conflating with the trailing-partial - # case at coordinate (1, 1). + # exercises absence. os.remove(os.path.join(dst, "order_f_u2", "0.1")) tmpfile = tmp_vsimem / "out.gpkg" @@ -297,25 +255,3 @@ def test_gdalalg_mdim_get_refs_zarr_sparse(tmp_path, tmp_vsimem): # And specifically, only chunk (0, 1) should be absent. layer.SetAttributeFilter("present = 0") assert layer.GetFeatureCount() == 1 - - -############################################################################### -# Parquet-specific tests -############################################################################### - - -@pytest.mark.require_driver("Parquet") -def test_gdalalg_mdim_get_refs_parquet_boolean_subtype(tmp_vsimem): - """OFSTBoolean subtype survives Parquet round-trip.""" - tmpfile = tmp_vsimem / "out.parquet" - alg = get_mdim_get_refs_alg() - alg["array"] = "/var" - alg["output-format"] = "Parquet" - assert alg.ParseRunAndFinalize(["data/mdim_zarr.vrt", str(tmpfile)]) - - ds = gdal.OpenEx(str(tmpfile), gdal.OF_VECTOR) - layer = ds.GetLayer(0) - defn = layer.GetLayerDefn() - idx = defn.GetFieldIndex("present") - assert idx >= 0 - assert defn.GetFieldDefn(idx).GetSubType() == ogr.OFSTBoolean diff --git a/doc/source/programs/gdal_mdim_get_refs.rst b/doc/source/programs/gdal_mdim_get_refs.rst index 061d0001be61..82110b718172 100644 --- a/doc/source/programs/gdal_mdim_get_refs.rst +++ b/doc/source/programs/gdal_mdim_get_refs.rst @@ -32,7 +32,7 @@ any per-chunk metadata reported by the driver. The output is an attribute-only vector layer (no geometry). The algorithm is format-general: it relies on the multidimensional -``GetRawBlockInfo()``. +:cpp:func:`GDALMDArray::GetRawBlockInfo`. Three-state classification ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -88,30 +88,28 @@ The output layer's schema is determined by the input array's rank: - String, nullable - Per-chunk metadata reported by the driver, joined as ``KEY=VALUE`` pairs separated by ``; ``. Typically describes compression and - byte-order. The same codec information is also hoisted to the layer + byte-order. The same codec information is also copied to the layer as metadata items ``CODEC_*``. Layer metadata ~~~~~~~~~~~~~~ The output layer carries the following metadata items, accessible via -:cpp:func:`GDALMajorObject::GetMetadata` (``-mdd`` in :program:`ogrinfo`): +:cpp:func:`GDALMajorObject::GetMetadata`: -* ``ARRAY_NAME`` — the fullname of the input array -* ``DTYPE`` — the array's data type (e.g. ``Int16``, ``Float32``) -* ``DIM_N_NAME``, ``DIM_N_SIZE``, ``DIM_N_BLOCK``, ``DIM_N_CHUNKS`` — for +* ``ARRAY_NAME`` - the fullname of the input array +* ``DATA_TYPE`` - the array's data type (e.g. ``Int16``, ``Float32``) +* ``DIM_N_NAME``, ``DIM_N_SIZE``, ``DIM_N_BLOCK``, ``DIM_N_CHUNKS`` - for each dimension, the dimension name, size, block size, and number of chunks along that axis -* ``CODEC_*`` — array-level codec metadata hoisted from the first chunk +* ``CODEC_*`` - array-level codec metadata copied from the first chunk (e.g. ``CODEC_COMPRESSION=DEFLATE``, ``CODEC_FILTER=SHUFFLE``) Limitations ----------- * Only a single array can be emitted (``--array`` is required). -* Arrays without natural block size decline with a ``not chunk-enumerable`` error. * The inline data payload is not extracted as a binary field. -* No geometry column is emitted. * ``GetRawBlockInfo()`` iterates chunks and this can be slow especially for remote sources. Options @@ -165,7 +163,7 @@ Examples .. code-block:: bash - ogrinfo ocean_chunks.parquet -sql \ + gdal vector info ocean_chunks.parquet --features -sql \ 'SELECT "offset", size FROM ocean_chunks WHERE dim_2 BETWEEN 600 AND 900 AND dim_3 BETWEEN 1200 AND 1500' @@ -183,8 +181,8 @@ Examples See Also -------- -* :ref:`gdal_mdim_info` — discover the arrays and chunk geometry of a +* :ref:`gdal_mdim_info` - discover the arrays and chunk geometry of a multidimensional dataset -* :ref:`gdal_mdim_convert` — copy or transform a multidimensional dataset -* :ref:`gdal_mdim_mosaic` — compose multiple multidimensional datasets into +* :ref:`gdal_mdim_convert` - copy or transform a multidimensional dataset +* :ref:`gdal_mdim_mosaic` - compose multiple multidimensional datasets into one virtual view