-
Notifications
You must be signed in to change notification settings - Fork 68
Add Datatree support in to_icechunk
#1243
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 6 commits
e8b5bba
b168440
29e7682
99a5b05
95ad390
70adcc0
59de7f1
7b0fec5
26e4c11
ea1ad18
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,7 @@ | |
| from icechunk import IcechunkStore, Session | ||
| from icechunk.session import ForkSession | ||
| from icechunk.vendor.xarray import _choose_default_mode | ||
| from xarray import DataArray, Dataset | ||
| from xarray import DataArray, Dataset, DataTree | ||
| from xarray.backends.common import ArrayWriter | ||
| from xarray.backends.zarr import ZarrStore | ||
|
|
||
|
|
@@ -32,7 +32,10 @@ | |
| ) | ||
|
|
||
| if Version(xr.__version__) > Version("2025.09.0"): | ||
| from xarray.backends.writers import _validate_dataset_names, dump_to_store # type: ignore[import-not-found] | ||
| from xarray.backends.writers import ( # type: ignore[import-not-found] | ||
| _validate_dataset_names, | ||
| dump_to_store, | ||
| ) | ||
| else: | ||
| from xarray.backends.api import _validate_dataset_names, dump_to_store | ||
|
|
||
|
|
@@ -185,6 +188,48 @@ def write_lazy( | |
| return session_merge_reduction(stored_arrays, split_every=split_every) | ||
|
|
||
|
|
||
| def write_ds( | ||
| ds, | ||
| store, | ||
| safe_chunks, | ||
| group, | ||
| mode, | ||
| append_dim, | ||
| region, | ||
| encoding, | ||
| chunkmanager_store_kwargs, | ||
| ): | ||
| writer = _XarrayDatasetWriter(ds, store=store, safe_chunks=safe_chunks) | ||
| writer._open_group(group=group, mode=mode, append_dim=append_dim, region=region) | ||
|
|
||
| # write metadata | ||
| writer.write_metadata(encoding) | ||
| # write in-memory arrays | ||
| writer.write_eager() | ||
| # eagerly write dask arrays | ||
| maybe_fork_session = writer.write_lazy( | ||
| chunkmanager_store_kwargs=chunkmanager_store_kwargs | ||
| ) | ||
|
|
||
| return maybe_fork_session | ||
|
|
||
|
|
||
| # overload because several kwargs are currently forbidden for DataTree, and ``write_inherited_coords`` only applies to DataTree | ||
| @overload | ||
| def to_icechunk( | ||
| obj: DataTree, | ||
| session: Session, | ||
| *, | ||
| mode: ZarrWriteModes | None = None, | ||
| safe_chunks: bool = True, | ||
| encoding: Mapping[Any, Any] | None = None, | ||
| chunkmanager_store_kwargs: MutableMapping[Any, Any] | None = None, | ||
| write_inherited_coords: bool = False, | ||
| split_every: int | None = None, | ||
| ) -> None: ... | ||
|
|
||
|
|
||
| @overload | ||
| def to_icechunk( | ||
| obj: DataArray | Dataset, | ||
| session: Session, | ||
|
|
@@ -197,14 +242,32 @@ def to_icechunk( | |
| encoding: Mapping[Any, Any] | None = None, | ||
| chunkmanager_store_kwargs: MutableMapping[Any, Any] | None = None, | ||
| split_every: int | None = None, | ||
| ) -> None: ... | ||
|
|
||
|
|
||
| def to_icechunk( | ||
| obj: DataArray | Dataset | DataTree, | ||
| session: Session, | ||
| *, | ||
| group: str | None = None, | ||
| mode: ZarrWriteModes | None = None, | ||
| safe_chunks: bool = True, | ||
| append_dim: Hashable | None = None, | ||
| region: Region = None, | ||
| encoding: Mapping[Any, Any] | None = None, | ||
| chunkmanager_store_kwargs: MutableMapping[Any, Any] | None = None, | ||
| write_inherited_coords: bool = False, | ||
| split_every: int | None = None, | ||
| ) -> None: | ||
| """ | ||
| Write an Xarray object to a group of an Icechunk store. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| obj: DataArray or Dataset | ||
| Xarray object to write | ||
| obj: DataArray, Dataset, or DataTree | ||
| Xarray object to write. | ||
|
|
||
| Note: When passing a DataTree, the ``append_dim``, ``region``, and ``group`` parameters are not yet supported. | ||
| session : icechunk.Session | ||
| Writable Icechunk Session | ||
| mode : {"w", "w-", "a", "a-", "r+", None}, optional | ||
|
|
@@ -262,6 +325,11 @@ def to_icechunk( | |
| Additional keyword arguments passed on to the `ChunkManager.store` method used to store | ||
| chunked arrays. For example for a dask array additional kwargs will be passed eventually to | ||
| `dask.array.store()`. Experimental API that should not be relied upon. | ||
| write_inherited_coords : bool, default: False | ||
| If true, replicate inherited coordinates on all descendant nodes. | ||
| Otherwise, only write coordinates at the level at which they are | ||
| originally defined. This saves disk space, but requires opening the | ||
| full tree to load inherited coordinates. | ||
| split_every: int, optional | ||
| Number of tasks to merge at every level of the tree reduction. | ||
|
|
||
|
|
@@ -280,11 +348,25 @@ def to_icechunk( | |
| ``append_dim`` at the same time. To create empty arrays to fill | ||
| in with ``region``, use the `_XarrayDatasetWriter` directly. | ||
| """ | ||
|
|
||
| as_dataset = _make_dataset(obj) | ||
| # Validate parameters for DataTree | ||
| if isinstance(obj, DataTree): | ||
| if group is not None: | ||
| raise NotImplementedError( | ||
| "specifying a root group for the tree has not been implemented" | ||
| ) | ||
| if append_dim is not None: | ||
| raise NotImplementedError( | ||
| "The 'append_dim' parameter is not yet supported when writing DataTree objects." | ||
| ) | ||
| if region is not None: | ||
| raise NotImplementedError( | ||
| "The 'region' parameter is not yet supported when writing DataTree objects." | ||
| ) | ||
|
|
||
| # This ugliness is needed so that we allow users to call `to_icechunk` with a dirty Session | ||
| # for _serial_ writes | ||
|
|
||
| # TODO DataTree does not implement `__dask_graph__`, unlike `Dataset`, so will this ever trigger? | ||
| is_dask = is_dask_collection(obj) | ||
| fork: Session | ForkSession | ||
| if is_dask: | ||
|
|
@@ -296,18 +378,47 @@ def to_icechunk( | |
| else: | ||
| fork = session | ||
|
|
||
| writer = _XarrayDatasetWriter(as_dataset, store=fork.store, safe_chunks=safe_chunks) | ||
| if isinstance(obj, DataTree): | ||
| dt = obj | ||
|
|
||
| writer._open_group(group=group, mode=mode, append_dim=append_dim, region=region) | ||
| if encoding is None: | ||
| encoding = {} | ||
| if set(encoding) - set(dt.groups): | ||
| raise ValueError( | ||
| f"unexpected encoding group name(s) provided: {set(encoding) - set(dt.groups)}" | ||
| ) | ||
|
|
||
| for rel_path, node in dt.subtree_with_keys: | ||
| at_root = node is dt | ||
| dataset = node.to_dataset(inherit=write_inherited_coords or at_root) | ||
|
|
||
| # TODO what do I do with all these maybe_fork_sessions here? | ||
dcherian marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| maybe_fork_session = write_ds( | ||
| ds=dataset, | ||
| store=fork.store, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note for the future: this should be safe since each iteration of the loop writes to a different group, so there are no conflicts.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added this as a comment in 7b0fec5 |
||
| safe_chunks=safe_chunks, | ||
| group=dt[rel_path].path, | ||
| mode=mode, | ||
| append_dim=append_dim, | ||
| region=region, | ||
| encoding=encoding, | ||
| chunkmanager_store_kwargs=chunkmanager_store_kwargs, | ||
| ) | ||
|
|
||
| else: | ||
| as_dataset = _make_dataset(obj) | ||
| maybe_fork_session = write_ds( | ||
| ds=as_dataset, | ||
| store=fork.store, | ||
| safe_chunks=safe_chunks, | ||
| group=group, | ||
| mode=mode, | ||
| append_dim=append_dim, | ||
| region=region, | ||
| encoding=encoding, | ||
| chunkmanager_store_kwargs=chunkmanager_store_kwargs, | ||
| ) | ||
|
|
||
| # write metadata | ||
| writer.write_metadata(encoding) | ||
| # write in-memory arrays | ||
| writer.write_eager() | ||
| # eagerly write dask arrays | ||
| maybe_fork_session = writer.write_lazy( | ||
| chunkmanager_store_kwargs=chunkmanager_store_kwargs | ||
| ) | ||
| if is_dask: | ||
| if maybe_fork_session is None: | ||
| raise RuntimeError( | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.