From 8b3eb97841a71306ef848864fd61959474655e4c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 25 Jul 2025 10:51:01 +0200 Subject: [PATCH 1/5] fix: allow writing to v3/2 store when setting is v2/3 if possible --- docs/tutorials/zarr-v3.md | 4 ++-- src/anndata/_io/specs/methods.py | 4 ++-- src/anndata/_settings.py | 2 +- tests/test_backed_sparse.py | 2 +- tests/test_io_elementwise.py | 12 ------------ tests/test_io_partial.py | 6 ------ tests/test_readwrite.py | 21 ++++++++++++++++----- 7 files changed, 22 insertions(+), 29 deletions(-) diff --git a/docs/tutorials/zarr-v3.md b/docs/tutorials/zarr-v3.md index a5b4668e3..82d2e48d4 100644 --- a/docs/tutorials/zarr-v3.md +++ b/docs/tutorials/zarr-v3.md @@ -1,6 +1,6 @@ # zarr-v3 Guide/Roadmap -`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format via {attr}`anndata.settings.zarr_write_format`, with the exception of structured arrays. +`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format by setting {attr}`anndata.settings.zarr_write_format` before calling {func}`anndata.io.write_zarr` or {meth}`anndata.AnnData.write_zarr`, with the exception of structured arrays. Users should notice a significant performance improvement, especially for cloud data, but also likely for local data as well. Here is a quick guide on some of our learnings so far: @@ -48,7 +48,7 @@ import anndata as ad from collections.abc import Mapping from typing import Any -ad.settings.zarr_write_format = 3 # Absolutely crucial! Sharding is only for the v3 file format! +g = zarr.open_group(orig_path, mode="a", use_consolidated=False, zarr_version=3) # zarr_version 3 is default but note that sharding only works with v3! 
def write_sharded(group: zarr.Group, adata: ad.AnnData): def callback( diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 97d1a8640..9fe92dfa2 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -629,7 +629,7 @@ def write_vlen_string_array_zarr( dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) dtype = VariableLengthUTF8() filters, fill_value = None, None - if ad.settings.zarr_write_format == 2: + if f.metadata.zarr_format == 2: filters, fill_value = [VLenUTF8()], "" f.create_array( k, @@ -1283,7 +1283,7 @@ def write_scalar_zarr( from numcodecs import VLenUTF8 from zarr.core.dtype import VariableLengthUTF8 - match ad.settings.zarr_write_format, value: + match f.metadata.zarr_format, value: case 2, str(): filters, dtype, fill_value = [VLenUTF8()], VariableLengthUTF8(), "" case 3, str(): diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py index 22eb3680a..84ea652b7 100644 --- a/src/anndata/_settings.py +++ b/src/anndata/_settings.py @@ -447,7 +447,7 @@ def validate_zarr_write_format(format: int): settings.register( "zarr_write_format", default_value=2, - description="Which version of zarr to write to.", + description="Which version of zarr to write to when anndata must internally open a write-able zarr group.", validate=validate_zarr_write_format, get_from_env=lambda name, default: check_and_get_environ_var( f"ANNDATA_{name.upper()}", diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 3acdc4a55..c4f8deffd 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -517,7 +517,7 @@ def test_data_access( data, store=path / "X" / "data", chunks=(1,), - zarr_format=ad.settings.zarr_write_format, + zarr_format=f.metadata.zarr_format, ) store = AccessTrackingStore(path) store.initialize_key_trackers(["X/data"]) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index e03f21842..3bb18e489 100644 --- 
a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -206,14 +206,6 @@ def create_sparse_store( ], ) def test_io_spec(store, value, encoding_type): - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if ( - ad.settings.zarr_write_format == 3 - and encoding_type == "anndata" - and "O_recarray" in value.uns - ): - del value.uns["O_recarray"] with ad.settings.override(allow_write_nullable_strings=True): key = f"key_for_{encoding_type}" write_elem(store, key, value, dataset_kwargs={}) @@ -564,10 +556,6 @@ def test_write_to_root(store, value): """ Test that elements which are written as groups can we written to the root group. """ - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if ad.settings.zarr_write_format == 3 and isinstance(value, ad.AnnData): - del value.uns["O_recarray"] write_elem(store, "/", value) # See: https://github.com/zarr-developers/zarr-python/issues/2716 if isinstance(store, ZarrGroup) and not is_zarr_v2(): diff --git a/tests/test_io_partial.py b/tests/test_io_partial.py index 1476d6f4f..7ed96e9fa 100644 --- a/tests/test_io_partial.py +++ b/tests/test_io_partial.py @@ -10,7 +10,6 @@ import zarr from scipy.sparse import csr_matrix -import anndata from anndata import AnnData from anndata._io.specs.registry import read_elem_partial from anndata.io import read_elem, write_h5ad, write_zarr @@ -51,11 +50,6 @@ def test_read_partial_adata(tmp_path, diskfmt): import scanpy as sc adata = sc.datasets.pbmc68k_reduced() - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if anndata.settings.zarr_write_format == 3 and isinstance(adata, AnnData): - del adata.uns["rank_genes_groups"]["scores"] - del adata.uns["rank_genes_groups"]["names"] path = Path(tmp_path) / ("test_rp." 
+ diskfmt) diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index dd41e994a..5c513866e 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -821,11 +821,6 @@ def test_scanpy_pbmc68k(tmp_path, diskfmt, roundtrip, diskfmt2): with warnings.catch_warnings(): warnings.simplefilter("ignore", ad.OldFormatWarning) pbmc = sc.datasets.pbmc68k_reduced() - # zarr v3 can't write recarray - # https://github.com/zarr-developers/zarr-python/issues/2134 - if ad.settings.zarr_write_format == 3: - del pbmc.uns["rank_genes_groups"]["names"] - del pbmc.uns["rank_genes_groups"]["scores"] from_disk1 = roundtrip(pbmc, filepth1) # Do we read okay from_disk2 = roundtrip2(from_disk1, filepth2) # Can we round trip @@ -985,3 +980,19 @@ def test_write_elem_consolidated(tmp_path: Path): ValueError, match="Cannot overwrite/edit a store with consolidated metadata" ): ad.io.write_elem(g["obs"], "foo", np.arange(10)) + + +@pytest.mark.zarr_io +def test_write_elem_version_mismatch(tmp_path: Path): + if is_zarr_v2(): + pytest.skip("zarr v3 package test") + zarr_path = tmp_path / "foo.zarr" + adata = ad.AnnData(np.ones((10, 10))) + g = zarr.open_group( + zarr_path, + mode="w", + zarr_format=2 if ad.settings.zarr_write_format == 3 else 3, + ) + ad.io.write_elem(g, "/", adata) + adata_roundtripped = ad.read_zarr(g) + assert_equal(adata_roundtripped, adata) From 510d8680987a1bba6e009b33c7cdf24db242a5d7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 25 Jul 2025 10:57:05 +0200 Subject: [PATCH 2/5] fix: `zarr_format` --- tests/test_backed_sparse.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index c4f8deffd..2f8381279 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -513,11 +513,14 @@ def test_data_access( data = f["X/data"][...] 
del f["X/data"] # chunk one at a time to count properly + kwargs = {} + if not is_zarr_v2(): + kwargs["zarr_format"] = f.metadata.zarr_format zarr.array( data, store=path / "X" / "data", chunks=(1,), - zarr_format=f.metadata.zarr_format, + **kwargs, ) store = AccessTrackingStore(path) store.initialize_key_trackers(["X/data"]) From 1b4840c93bb4ab3d0f8fec1ca73b6924208c1a68 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 28 Jul 2025 12:04:15 +0200 Subject: [PATCH 3/5] fix: small things from pr review --- src/anndata/_io/specs/methods.py | 1 - tests/test_readwrite.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 9fe92dfa2..a343e352b 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -700,7 +700,6 @@ def write_recarray_zarr( else: dataset_kwargs = dataset_kwargs.copy() dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs) - # TODO: zarr’s on-disk format v3 doesn’t support this dtype f.create_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) f[k][...] = elem diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 5c513866e..910ce1ce2 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -983,9 +983,8 @@ def test_write_elem_consolidated(tmp_path: Path): @pytest.mark.zarr_io +@pytest.mark.skipif(is_zarr_v2(), reason="zarr v3 package test") def test_write_elem_version_mismatch(tmp_path: Path): - if is_zarr_v2(): - pytest.skip("zarr v3 package test") zarr_path = tmp_path / "foo.zarr" adata = ad.AnnData(np.ones((10, 10))) g = zarr.open_group( From ee6296d539a7fbf64a35d6a4d3647d8dc066656b Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Mon, 28 Jul 2025 12:12:29 +0200 Subject: [PATCH 4/5] remove useless check --- src/anndata/_io/specs/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index a343e352b..82f455198 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -695,7 +695,7 @@ def write_recarray_zarr( from anndata.compat import _to_fixed_length_strings elem = _to_fixed_length_strings(elem) - if isinstance(f, H5Group) or is_zarr_v2(): + if is_zarr_v2(): f.create_dataset(k, data=elem, shape=elem.shape, **dataset_kwargs) else: dataset_kwargs = dataset_kwargs.copy() From 7dd768926a06587dab8e132ff562e244a4dfba32 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 28 Jul 2025 14:35:03 +0200 Subject: [PATCH 5/5] chore: add comment --- src/anndata/compat/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index 00e81a80d..6eb4da48b 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -286,8 +286,12 @@ def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray: """\ Convert variable length strings to fixed length. - Currently a workaround for - https://github.com/zarr-developers/zarr-python/pull/422 + Formerly a workaround for + https://github.com/zarr-developers/zarr-python/pull/422, + resolved in https://github.com/zarr-developers/zarr-python/pull/813. + + But if we didn't do this conversion, we would have to use a special codec in v2 + for objects and v3 doesn't support objects at all. So we leave this function as-is. """ new_dtype = [] for dt_name, (dt_type, dt_offset) in value.dtype.fields.items():