diff --git a/docs/tutorials/zarr-v3.md b/docs/tutorials/zarr-v3.md index a5b4668e3..82d2e48d4 100644 --- a/docs/tutorials/zarr-v3.md +++ b/docs/tutorials/zarr-v3.md @@ -1,6 +1,6 @@ # zarr-v3 Guide/Roadmap -`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format via {attr}`anndata.settings.zarr_write_format`, with the exception of structured arrays. +`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format via {attr}`anndata.settings.zarr_write_format` when calling {func}`anndata.io.write_zarr` or {meth}`anndata.AnnData.write_zarr`, with the exception of structured arrays. Users should notice a significant performance improvement, especially for cloud data, but also likely for local data as well. Here is a quick guide on some of our learnings so far: @@ -48,7 +48,7 @@ import anndata as ad from collections.abc import Mapping from typing import Any -ad.settings.zarr_write_format = 3 # Absolutely crucial! Sharding is only for the v3 file format! +g = zarr.open_group(orig_path, mode="a", use_consolidated=False, zarr_version=3) # zarr_version 3 is default but note that sharding only works with v3! 
def write_sharded(group: zarr.Group, adata: ad.AnnData): def callback( diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index c154f5776..a91f5b2fe 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -630,7 +630,7 @@ def write_vlen_string_array_zarr( dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) dtype = VariableLengthUTF8() filters, fill_value = None, None - if ad.settings.zarr_write_format == 2: + if f.metadata.zarr_format == 2: filters, fill_value = [VLenUTF8()], "" f.create_array( k, @@ -1283,7 +1283,7 @@ def write_scalar_zarr( from numcodecs import VLenUTF8 from zarr.core.dtype import VariableLengthUTF8 - match ad.settings.zarr_write_format, value: + match f.metadata.zarr_format, value: case 2, str(): filters, dtype, fill_value = [VLenUTF8()], VariableLengthUTF8(), "" case 3, str(): diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py index 22eb3680a..84ea652b7 100644 --- a/src/anndata/_settings.py +++ b/src/anndata/_settings.py @@ -447,7 +447,7 @@ def validate_zarr_write_format(format: int): settings.register( "zarr_write_format", default_value=2, - description="Which version of zarr to write to.", + description="Which version of zarr to write to when anndata must internally open a write-able zarr group.", validate=validate_zarr_write_format, get_from_env=lambda name, default: check_and_get_environ_var( f"ANNDATA_{name.upper()}", diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index 00e81a80d..6eb4da48b 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -286,8 +286,12 @@ def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray: """\ Convert variable length strings to fixed length. 
- Currently a workaround for - https://github.com/zarr-developers/zarr-python/pull/422 + Formerly a workaround for + https://github.com/zarr-developers/zarr-python/pull/422, + resolved in https://github.com/zarr-developers/zarr-python/pull/813. + + But if we didn't do this conversion, we would have to use a special codec in v2 + for objects and v3 doesn't support objects at all. So we leave this function as-is. """ new_dtype = [] for dt_name, (dt_type, dt_offset) in value.dtype.fields.items(): diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 3acdc4a55..2f8381279 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -513,11 +513,14 @@ def test_data_access( data = f["X/data"][...] del f["X/data"] # chunk one at a time to count properly + kwargs = {} + if not is_zarr_v2(): + kwargs["zarr_format"] = f.metadata.zarr_format zarr.array( data, store=path / "X" / "data", chunks=(1,), - zarr_format=ad.settings.zarr_write_format, + **kwargs, ) store = AccessTrackingStore(path) store.initialize_key_trackers(["X/data"]) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index a1f34287f..cdf6769c3 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -206,14 +206,6 @@ def create_sparse_store( ], ) def test_io_spec(store, value, encoding_type): - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if ( - ad.settings.zarr_write_format == 3 - and encoding_type == "anndata" - and "O_recarray" in value.uns - ): - del value.uns["O_recarray"] with ad.settings.override(allow_write_nullable_strings=True): key = f"key_for_{encoding_type}" write_elem(store, key, value, dataset_kwargs={}) @@ -564,10 +556,6 @@ def test_write_to_root(store, value): """ Test that elements which are written as groups can we written to the root group. 
""" - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if ad.settings.zarr_write_format == 3 and isinstance(value, ad.AnnData): - del value.uns["O_recarray"] write_elem(store, "/", value) # See: https://github.com/zarr-developers/zarr-python/issues/2716 if isinstance(store, ZarrGroup) and not is_zarr_v2(): diff --git a/tests/test_io_partial.py b/tests/test_io_partial.py index 1476d6f4f..7ed96e9fa 100644 --- a/tests/test_io_partial.py +++ b/tests/test_io_partial.py @@ -10,7 +10,6 @@ import zarr from scipy.sparse import csr_matrix -import anndata from anndata import AnnData from anndata._io.specs.registry import read_elem_partial from anndata.io import read_elem, write_h5ad, write_zarr @@ -51,11 +50,6 @@ def test_read_partial_adata(tmp_path, diskfmt): import scanpy as sc adata = sc.datasets.pbmc68k_reduced() - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if anndata.settings.zarr_write_format == 3 and isinstance(adata, AnnData): - del adata.uns["rank_genes_groups"]["scores"] - del adata.uns["rank_genes_groups"]["names"] path = Path(tmp_path) / ("test_rp." 
+ diskfmt) diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 7039af709..24c965969 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -843,11 +843,6 @@ def test_scanpy_pbmc68k(tmp_path, diskfmt, roundtrip, diskfmt2): with warnings.catch_warnings(): warnings.simplefilter("ignore", ad.OldFormatWarning) pbmc = sc.datasets.pbmc68k_reduced() - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if ad.settings.zarr_write_format == 3: - del pbmc.uns["rank_genes_groups"]["names"] - del pbmc.uns["rank_genes_groups"]["scores"] from_disk1 = roundtrip(pbmc, filepth1) # Do we read okay from_disk2 = roundtrip2(from_disk1, filepth2) # Can we round trip @@ -1010,3 +1005,18 @@ def test_write_elem_consolidated(tmp_path: Path): ValueError, match="Cannot overwrite/edit a store with consolidated metadata" ): ad.io.write_elem(g["obs"], "foo", np.arange(10)) + + +@pytest.mark.zarr_io +@pytest.mark.skipif(is_zarr_v2(), reason="zarr v3 package test") +def test_write_elem_version_mismatch(tmp_path: Path): + zarr_path = tmp_path / "foo.zarr" + adata = ad.AnnData(np.ones((10, 10))) + g = zarr.open_group( + zarr_path, + mode="w", + zarr_format=2 if ad.settings.zarr_write_format == 3 else 3, + ) + ad.io.write_elem(g, "/", adata) + adata_roundtripped = ad.read_zarr(g) + assert_equal(adata_roundtripped, adata)