diff --git a/pyproject.toml b/pyproject.toml
index 9108b15a..898c7da6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -101,6 +101,7 @@ upstream = [
    's3fs @ git+https://github.com/fsspec/s3fs',
    'kerchunk @ git+https://github.com/fsspec/kerchunk',
    'icechunk @ git+https://github.com/earth-mover/icechunk#subdirectory=icechunk-python',
+    'virtual_tiff @ git+https://github.com/virtual-zarr/virtual-tiff',
 ]
 docs = [
    "mkdocs-material[imaging]>=9.6.14",
diff --git a/virtualizarr/__init__.py b/virtualizarr/__init__.py
index 34264206..8ffc6e71 100644
--- a/virtualizarr/__init__.py
+++ b/virtualizarr/__init__.py
@@ -4,7 +4,11 @@
     VirtualiZarrDatasetAccessor,
     VirtualiZarrDataTreeAccessor,
 )
-from virtualizarr.xarray import open_virtual_dataset, open_virtual_mfdataset
+from virtualizarr.xarray import (
+    open_virtual_dataset,
+    open_virtual_datatree,
+    open_virtual_mfdataset,
+)
 
 try:
     __version__ = _version("virtualizarr")
@@ -18,4 +22,5 @@
     "VirtualiZarrDataTreeAccessor",
     "open_virtual_dataset",
     "open_virtual_mfdataset",
+    "open_virtual_datatree",
 ]
diff --git a/virtualizarr/manifests/group.py b/virtualizarr/manifests/group.py
index 7f637fcf..d226e844 100644
--- a/virtualizarr/manifests/group.py
+++ b/virtualizarr/manifests/group.py
@@ -125,3 +125,37 @@ def to_virtual_dataset(self) -> xr.Dataset:
             coord_names=coord_names,
             attrs=attributes,
         )
+
+    def to_virtual_datasets(self) -> dict[str, xr.Dataset]:
+        """
+        Create a dictionary mapping group paths to virtual datasets, covering this ManifestGroup and all of
+        its sub-groups. All variables in the datasets will be "virtual", i.e. they will wrap ManifestArray objects.
+
+        It is convenient to have a `to_virtual_datasets` method separate from `to_virtual_datatree` so that
+        it can be called recursively without needing to use `DataTree.to_dict()` and `DataTree.from_dict()` repeatedly.
+        """
+        result = {"": self.to_virtual_dataset()}
+
+        # Recursively process all subgroups
+        for group_name, subgroup in self.groups.items():
+            subgroup_datasets = subgroup.to_virtual_datasets()
+
+            # Add the subgroup's datasets with proper path prefixes
+            for subpath, dataset in subgroup_datasets.items():
+                if subpath == "":
+                    # Direct child group
+                    full_path = group_name
+                else:
+                    # Nested subgroup
+                    full_path = f"{group_name}/{subpath}"
+                result[full_path] = dataset
+        return result
+
+    def to_virtual_datatree(self) -> xr.DataTree:
+        """
+        Create a "virtual" [xarray.DataTree][] containing the contents of one zarr group.
+
+        All variables in the returned DataTree will be "virtual", i.e. they will wrap ManifestArray objects.
+        """
+        datasets = self.to_virtual_datasets()
+        return xr.DataTree.from_dict(datasets)
diff --git a/virtualizarr/manifests/store.py b/virtualizarr/manifests/store.py
index ba509857..f484a97a 100644
--- a/virtualizarr/manifests/store.py
+++ b/virtualizarr/manifests/store.py
@@ -327,6 +327,58 @@ def to_virtual_dataset(
             decode_times=decode_times,
         )
 
+    def to_virtual_datatree(
+        self,
+        group: str = "",
+        *,
+        drop_variables: Iterable[str] | None = None,
+        loadable_variables: Iterable[str] | None = None,
+        decode_times: bool | None = None,
+    ) -> "xr.DataTree":
+        """
+        Create a "virtual" [xarray.DataTree][] containing the contents of a zarr group; defaults to the root group and all of its sub-groups.
+
+        Will ignore the contents of any other groups in the store.
+
+        Requires xarray.
+
+        Parameters
+        ----------
+        group
+            Path of the group to convert to a virtual DataTree. Defaults to the root group.
+        drop_variables
+            Variables in the data source to drop before returning.
+        loadable_variables
+            Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays.
+        decode_times
+            Bool that is passed on to [xarray.open_dataset][]; allows times to be decoded into datetime objects.
+
+        Returns
+        -------
+        vdt : xarray.DataTree
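+
+        Examples
+        --------
+        A minimal sketch; the parser, registry, and local file below are illustrative, not part of this change:
+
+        >>> from obstore.store import LocalStore
+        >>> from virtualizarr.parsers import HDFParser
+        >>> from virtualizarr.registry import ObjectStoreRegistry
+        >>> registry = ObjectStoreRegistry({"file://": LocalStore()})  # doctest: +SKIP
+        >>> manifest_store = HDFParser()(url="file:///tmp/my-data.nc", registry=registry)  # doctest: +SKIP
+        >>> vdt = manifest_store.to_virtual_datatree()  # doctest: +SKIP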
+        """
+
+        from virtualizarr.xarray import construct_virtual_datatree
+
+        return construct_virtual_datatree(
+            manifest_store=self,
+            group=group,
+            loadable_variables=loadable_variables,
+            decode_times=decode_times,
+            drop_variables=drop_variables,
+        )
+
 
 def _transform_byte_range(
     byte_range: ByteRequest | None, *, chunk_start: int, chunk_end_exclusive: int
diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py
index 38ea85a0..02397c0c 100644
--- a/virtualizarr/tests/test_parsers/test_tiff.py
+++ b/virtualizarr/tests/test_parsers/test_tiff.py
@@ -1,8 +1,8 @@
 import pytest
 from obstore.store import S3Store
-from xarray import Dataset
+from xarray import Dataset, DataTree
 
-from virtualizarr import open_virtual_dataset
+from virtualizarr import open_virtual_dataset, open_virtual_datatree
 from virtualizarr.registry import ObjectStoreRegistry
 from virtualizarr.tests import requires_network, requires_tiff
 
@@ -11,7 +11,22 @@
 
 @requires_tiff
 @requires_network
-def test_virtual_tiff() -> None:
+def test_virtual_tiff_datatree() -> None:
+    store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
+    registry = ObjectStoreRegistry({"s3://sentinel-cogs/": store})
+    url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"
+    parser = virtual_tiff.VirtualTIFF(ifd_layout="nested")
+    with open_virtual_datatree(url=url, parser=parser, registry=registry) as vdt:
+        assert isinstance(vdt, DataTree)
+        assert list(vdt["0"].ds.variables) == ["0"]
+        var = vdt["0"].ds["0"].variable
+        assert var.sizes == {"y": 10980, "x": 10980}
+        assert var.dtype == "<u2"
+
+
+@requires_tiff
+@requires_network
+def test_virtual_tiff() -> None:
     store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
     registry = ObjectStoreRegistry({"s3://sentinel-cogs/": store})
     url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index bcd63b20..e868589e 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -21,7 +21,7 @@
 from xarray.core.types import NestedSequence
 from xarray.structure.combine import _infer_concat_order_from_positions, _nested_combine
 
-from virtualizarr.manifests import ManifestStore
+from virtualizarr.manifests import ManifestArray, ManifestGroup, ManifestStore
 from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
 from virtualizarr.parallel import get_executor
 from virtualizarr.parsers.typing import Parser
@@ -35,6 +35,78 @@
 )
 
 
+def open_virtual_datatree(
+    url: str,
+    registry: ObjectStoreRegistry,
+    parser: Parser,
+    *,
+    drop_variables: Iterable[str] | None = None,
+    loadable_variables: Iterable[str] | None = None,
+    decode_times: bool | None = None,
+) -> xr.DataTree:
+    """
+    Open an archival data source as an [xarray.DataTree][] wrapping virtualized zarr arrays.
+
+    No data variables will be loaded unless specified in the ``loadable_variables`` kwarg (in which case they will open as lazily indexed arrays using xarray's standard lazy indexing classes).
+
+    Parameters
+    ----------
+    url
+        The url of the data source to virtualize. The URL should include a scheme. For example:
+
+        - `url="file:///Users/my-name/Documents/my-project/my-data.nc"` for a local data source.
+        - `url="s3://my-bucket/my-project/my-data.nc"` for a remote data source on an S3 compatible cloud.
+
+    registry
+        An [ObjectStoreRegistry][virtualizarr.registry.ObjectStoreRegistry] for resolving urls and reading data.
+    parser
+        A parser to use for the given data source. For example:
+
+        - [virtualizarr.parsers.HDFParser][] for virtualizing NetCDF4 or HDF5 files.
+        - [virtualizarr.parsers.FITSParser][] for virtualizing FITS files.
+        - [virtualizarr.parsers.NetCDF3Parser][] for virtualizing NetCDF3 files.
+        - [virtualizarr.parsers.KerchunkJSONParser][] for re-opening Kerchunk JSONs.
+        - [virtualizarr.parsers.KerchunkParquetParser][] for re-opening Kerchunk Parquets.
+        - [virtualizarr.parsers.ZarrParser][] for virtualizing Zarr stores.
+    drop_variables
+        Variables in the data source to drop before returning.
+    loadable_variables
+        Variables in the data source to load as Dask/NumPy arrays instead of as virtual arrays.
+    decode_times
+        Bool that is passed on to [xarray.open_dataset][]; allows times to be decoded into datetime objects.
+
+    Returns
+    -------
+    vdt
+        An [xarray.DataTree][] containing virtual chunk references for all variables.
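+
+    Examples
+    --------
+    A minimal sketch; the store, url, and parser below are illustrative, not prescribed by this change:
+
+    >>> from obstore.store import LocalStore
+    >>> from virtualizarr import open_virtual_datatree
+    >>> from virtualizarr.parsers import HDFParser
+    >>> from virtualizarr.registry import ObjectStoreRegistry
+    >>> registry = ObjectStoreRegistry({"file://": LocalStore()})  # doctest: +SKIP
+    >>> vdt = open_virtual_datatree(
+    ...     url="file:///tmp/my-data.nc", registry=registry, parser=HDFParser()
+    ... )  # doctest: +SKIP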
+    """
+    filepath = validate_and_normalize_path_to_uri(url, fs_root=Path.cwd().as_uri())
+
+    manifest_store = parser(
+        url=filepath,
+        registry=registry,
+    )
+
+    return manifest_store.to_virtual_datatree(
+        loadable_variables=loadable_variables,
+        decode_times=decode_times,
+        drop_variables=drop_variables,
+    )
+
+
 def open_virtual_dataset(
     url: str,
     registry: ObjectStoreRegistry,
@@ -354,6 +426,44 @@ def construct_virtual_dataset(
     )
 
 
+def construct_virtual_datatree(
+    manifest_store: ManifestStore,
+    group: str = "",
+    *,
+    drop_variables: Iterable[str] | None = None,
+    loadable_variables: Iterable[str] | None = None,
+    decode_times: bool | None = None,
+) -> xr.DataTree:
+    """
+    Construct a fully or partly virtual datatree from a ManifestStore.
+    """
+    fully_loadable_datatree = xr.open_datatree(
+        manifest_store,  # type: ignore[arg-type]
+        group=group,
+        engine="zarr",
+        consolidated=False,
+        zarr_format=3,
+        decode_times=decode_times,
+    )
+    if group:
+        node = manifest_store._group[group]
+    else:
+        node = manifest_store._group
+    if isinstance(node, ManifestArray):
+        node = ManifestGroup(arrays={group: node}, attributes={})
+    fully_virtual_datatree = node.to_virtual_datatree()
+
+    partially_loaded_datasets = {}
+    for name, virtual_node in fully_virtual_datatree.subtree_with_keys:
+        loadable_node = fully_loadable_datatree[name]
+        node_dataset = replace_virtual_with_loadable_vars(
+            virtual_node.to_dataset(), loadable_node.to_dataset(), loadable_variables
+        )
+        node_dataset = node_dataset.drop_vars(list(drop_variables or ()))
+        partially_loaded_datasets[name] = node_dataset
+    return xr.DataTree.from_dict(partially_loaded_datasets)
+
+
 def replace_virtual_with_loadable_vars(
     fully_virtual_ds: xr.Dataset,
     loadable_ds: xr.Dataset,
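Usage sketch for reviewers, mirroring the new `test_virtual_tiff_datatree` test above (assumes the optional `virtual_tiff` dependency added to `pyproject.toml` is installed; the bucket, key, and `ifd_layout` come straight from the test):

```python
from obstore.store import S3Store
import virtual_tiff

from virtualizarr import open_virtual_datatree
from virtualizarr.registry import ObjectStoreRegistry

# Anonymous access to the public Sentinel-2 COG bucket used in the test suite.
store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
registry = ObjectStoreRegistry({"s3://sentinel-cogs/": store})
url = "s3://sentinel-cogs/sentinel-s2-l2a-cogs/12/S/UF/2022/6/S2B_12SUF_20220609_0_L2A/B04.tif"

# Each TIFF IFD becomes a child group of the returned DataTree ("0" is full resolution).
vdt = open_virtual_datatree(
    url=url,
    parser=virtual_tiff.VirtualTIFF(ifd_layout="nested"),
    registry=registry,
)
print(vdt["0"].ds)  # a virtual variable wrapping ManifestArray chunk references
```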