Skip to content
4 changes: 2 additions & 2 deletions docs/tutorials/zarr-v3.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# zarr-v3 Guide/Roadmap

`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format via {attr}`anndata.settings.zarr_write_format`, with the exception of structured arrays.
`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format by setting {attr}`anndata.settings.zarr_write_format` when writing with {func}`anndata.io.write_zarr` or {meth}`anndata.AnnData.write_zarr`, with the exception of structured arrays.
Users should notice a significant performance improvement, especially for cloud data, and likely for local data as well.
Here is a quick guide on some of our learnings so far:

Expand Down Expand Up @@ -48,7 +48,7 @@ import anndata as ad
from collections.abc import Mapping
from typing import Any

ad.settings.zarr_write_format = 3 # Absolutely crucial! Sharding is only for the v3 file format!
g = zarr.open_group(orig_path, mode="a", use_consolidated=False, zarr_version=3) # zarr_version 3 is default but note that sharding only works with v3!
Comment thread
flying-sheep marked this conversation as resolved.

def write_sharded(group: zarr.Group, adata: ad.AnnData):
def callback(
Expand Down
4 changes: 2 additions & 2 deletions src/anndata/_io/specs/methods.py
Comment thread
ilan-gold marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ def write_vlen_string_array_zarr(
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
dtype = VariableLengthUTF8()
filters, fill_value = None, None
if ad.settings.zarr_write_format == 2:
if f.metadata.zarr_format == 2:
filters, fill_value = [VLenUTF8()], ""
f.create_array(
k,
Expand Down Expand Up @@ -1283,7 +1283,7 @@ def write_scalar_zarr(
from numcodecs import VLenUTF8
from zarr.core.dtype import VariableLengthUTF8

match ad.settings.zarr_write_format, value:
match f.metadata.zarr_format, value:
case 2, str():
filters, dtype, fill_value = [VLenUTF8()], VariableLengthUTF8(), ""
case 3, str():
Expand Down
2 changes: 1 addition & 1 deletion src/anndata/_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ def validate_zarr_write_format(format: int):
settings.register(
"zarr_write_format",
default_value=2,
description="Which version of zarr to write to.",
description="Which version of zarr to write to when anndata must internally open a write-able zarr group.",
validate=validate_zarr_write_format,
get_from_env=lambda name, default: check_and_get_environ_var(
f"ANNDATA_{name.upper()}",
Expand Down
8 changes: 6 additions & 2 deletions src/anndata/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,12 @@ def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray:
"""\
Convert variable length strings to fixed length.

Currently a workaround for
https://github.com/zarr-developers/zarr-python/pull/422
Formerly a workaround for
https://github.com/zarr-developers/zarr-python/pull/422,
resolved in https://github.com/zarr-developers/zarr-python/pull/813.

But if we didn't do this conversion, we would have to use a special codec in v2
for objects and v3 doesn't support objects at all. So we leave this function as-is.
"""
new_dtype = []
for dt_name, (dt_type, dt_offset) in value.dtype.fields.items():
Expand Down
5 changes: 4 additions & 1 deletion tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,11 +513,14 @@ def test_data_access(
data = f["X/data"][...]
del f["X/data"]
# chunk one at a time to count properly
kwargs = {}
if not is_zarr_v2():
kwargs["zarr_format"] = f.metadata.zarr_format
zarr.array(
data,
store=path / "X" / "data",
chunks=(1,),
zarr_format=ad.settings.zarr_write_format,
**kwargs,
)
store = AccessTrackingStore(path)
store.initialize_key_trackers(["X/data"])
Expand Down
12 changes: 0 additions & 12 deletions tests/test_io_elementwise.py
Comment thread
flying-sheep marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -206,14 +206,6 @@ def create_sparse_store(
],
)
def test_io_spec(store, value, encoding_type):
# zarr v3 can't write recarray
Comment thread
ilan-gold marked this conversation as resolved.
# https://github.com/zarr-developers/zarr-python/issues/2134
if (
ad.settings.zarr_write_format == 3
and encoding_type == "anndata"
and "O_recarray" in value.uns
):
del value.uns["O_recarray"]
with ad.settings.override(allow_write_nullable_strings=True):
key = f"key_for_{encoding_type}"
write_elem(store, key, value, dataset_kwargs={})
Expand Down Expand Up @@ -564,10 +556,6 @@ def test_write_to_root(store, value):
"""
Test that elements which are written as groups can be written to the root group.
"""
# zarr v3 can't write recarray
# https://github.com/zarr-developers/zarr-python/issues/2134
if ad.settings.zarr_write_format == 3 and isinstance(value, ad.AnnData):
del value.uns["O_recarray"]
write_elem(store, "/", value)
# See: https://github.com/zarr-developers/zarr-python/issues/2716
if isinstance(store, ZarrGroup) and not is_zarr_v2():
Expand Down
6 changes: 0 additions & 6 deletions tests/test_io_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import zarr
from scipy.sparse import csr_matrix

import anndata
from anndata import AnnData
from anndata._io.specs.registry import read_elem_partial
from anndata.io import read_elem, write_h5ad, write_zarr
Expand Down Expand Up @@ -51,11 +50,6 @@ def test_read_partial_adata(tmp_path, diskfmt):
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# zarr v3 can't write recarray
# https://github.com/zarr-developers/zarr-python/issues/2134
if anndata.settings.zarr_write_format == 3 and isinstance(adata, AnnData):
del adata.uns["rank_genes_groups"]["scores"]
del adata.uns["rank_genes_groups"]["names"]

path = Path(tmp_path) / ("test_rp." + diskfmt)

Expand Down
20 changes: 15 additions & 5 deletions tests/test_readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,11 +843,6 @@ def test_scanpy_pbmc68k(tmp_path, diskfmt, roundtrip, diskfmt2):
with warnings.catch_warnings():
warnings.simplefilter("ignore", ad.OldFormatWarning)
pbmc = sc.datasets.pbmc68k_reduced()
# zarr v3 can't write recarray
# https://github.com/zarr-developers/zarr-python/issues/2134
if ad.settings.zarr_write_format == 3:
del pbmc.uns["rank_genes_groups"]["names"]
del pbmc.uns["rank_genes_groups"]["scores"]

from_disk1 = roundtrip(pbmc, filepth1) # Do we read okay
from_disk2 = roundtrip2(from_disk1, filepth2) # Can we round trip
Expand Down Expand Up @@ -1010,3 +1005,18 @@ def test_write_elem_consolidated(tmp_path: Path):
ValueError, match="Cannot overwrite/edit a store with consolidated metadata"
):
ad.io.write_elem(g["obs"], "foo", np.arange(10))


@pytest.mark.zarr_io
@pytest.mark.skipif(is_zarr_v2(), reason="zarr v3 package test")
def test_write_elem_version_mismatch(tmp_path: Path):
    """Round-trip an AnnData through a group whose format differs from the setting.

    The zarr group is opened with whichever format (2 or 3) is *not* the
    current value of ``ad.settings.zarr_write_format``.  The object is written
    to the group root with ``ad.io.write_elem``, read back with
    ``ad.read_zarr``, and compared against the original.

    Skipped when ``is_zarr_v2()`` is true.
    """
    original = ad.AnnData(np.ones((10, 10)))

    # Deliberately pick the format the global setting is NOT configured for.
    if ad.settings.zarr_write_format == 3:
        mismatched_format = 2
    else:
        mismatched_format = 3

    group = zarr.open_group(
        tmp_path / "foo.zarr",
        mode="w",
        zarr_format=mismatched_format,
    )
    ad.io.write_elem(group, "/", original)

    assert_equal(ad.read_zarr(group), original)