Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/tutorials/zarr-v3.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# zarr-v3 Guide/Roadmap

`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format via {attr}`anndata.settings.zarr_write_format`, with the exception of structured arrays.
`anndata` now uses the much improved {mod}`zarr` v3 package and also allows writing of datasets in the v3 format, selected via {attr}`anndata.settings.zarr_write_format`, when writing with {func}`anndata.io.write_zarr` or {meth}`anndata.AnnData.write_zarr`, with the exception of structured arrays.
Users should notice a significant performance improvement, especially for cloud data, but likely for local data as well.
Here is a quick guide on some of our learnings so far:

Expand Down Expand Up @@ -48,7 +48,7 @@ import anndata as ad
from collections.abc import Mapping
from typing import Any

ad.settings.zarr_write_format = 3 # Absolutely crucial! Sharding is only for the v3 file format!
g = zarr.open_group(orig_path, mode="a", use_consolidated=False, zarr_version=3) # zarr_version 3 is default but note that sharding only works with v3!

def write_sharded(group: zarr.Group, adata: ad.AnnData):
def callback(
Expand Down
4 changes: 2 additions & 2 deletions src/anndata/_io/specs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ def write_vlen_string_array_zarr(
dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
dtype = VariableLengthUTF8()
filters, fill_value = None, None
if ad.settings.zarr_write_format == 2:
if f.metadata.zarr_format == 2:
filters, fill_value = [VLenUTF8()], ""
f.create_array(
k,
Expand Down Expand Up @@ -1283,7 +1283,7 @@ def write_scalar_zarr(
from numcodecs import VLenUTF8
from zarr.core.dtype import VariableLengthUTF8

match ad.settings.zarr_write_format, value:
match f.metadata.zarr_format, value:
case 2, str():
filters, dtype, fill_value = [VLenUTF8()], VariableLengthUTF8(), ""
case 3, str():
Expand Down
2 changes: 1 addition & 1 deletion src/anndata/_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ def validate_zarr_write_format(format: int):
settings.register(
"zarr_write_format",
default_value=2,
description="Which version of zarr to write to.",
description="Which version of zarr to write to when anndata must internally open a write-able zarr group.",
validate=validate_zarr_write_format,
get_from_env=lambda name, default: check_and_get_environ_var(
f"ANNDATA_{name.upper()}",
Expand Down
8 changes: 6 additions & 2 deletions src/anndata/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,8 +286,12 @@ def _to_fixed_length_strings(value: np.ndarray) -> np.ndarray:
"""\
Convert variable length strings to fixed length.

Currently a workaround for
https://github.com/zarr-developers/zarr-python/pull/422
Formerly a workaround for
https://github.com/zarr-developers/zarr-python/pull/422,
resolved in https://github.com/zarr-developers/zarr-python/pull/813.

But if we didn't do this conversion, we would have to use a special codec in v2
for objects and v3 doesn't support objects at all. So we leave this function as-is.
"""
new_dtype = []
for dt_name, (dt_type, dt_offset) in value.dtype.fields.items():
Expand Down
5 changes: 4 additions & 1 deletion tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,11 +513,14 @@ def test_data_access(
data = f["X/data"][...]
del f["X/data"]
# chunk one at a time to count properly
kwargs = {}
if not is_zarr_v2():
kwargs["zarr_format"] = f.metadata.zarr_format
zarr.array(
data,
store=path / "X" / "data",
chunks=(1,),
zarr_format=ad.settings.zarr_write_format,
**kwargs,
)
store = AccessTrackingStore(path)
store.initialize_key_trackers(["X/data"])
Expand Down
12 changes: 0 additions & 12 deletions tests/test_io_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,14 +206,6 @@ def create_sparse_store(
],
)
def test_io_spec(store, value, encoding_type):
# zarr v3 can't write recarray
# https://github.com/zarr-developers/zarr-python/issues/2134
if (
ad.settings.zarr_write_format == 3
and encoding_type == "anndata"
and "O_recarray" in value.uns
):
del value.uns["O_recarray"]
with ad.settings.override(allow_write_nullable_strings=True):
key = f"key_for_{encoding_type}"
write_elem(store, key, value, dataset_kwargs={})
Expand Down Expand Up @@ -564,10 +556,6 @@ def test_write_to_root(store, value):
"""
Test that elements which are written as groups can be written to the root group.
"""
# zarr v3 can't write recarray
# https://github.com/zarr-developers/zarr-python/issues/2134
if ad.settings.zarr_write_format == 3 and isinstance(value, ad.AnnData):
del value.uns["O_recarray"]
write_elem(store, "/", value)
# See: https://github.com/zarr-developers/zarr-python/issues/2716
if isinstance(store, ZarrGroup) and not is_zarr_v2():
Expand Down
6 changes: 0 additions & 6 deletions tests/test_io_partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import zarr
from scipy.sparse import csr_matrix

import anndata
from anndata import AnnData
from anndata._io.specs.registry import read_elem_partial
from anndata.io import read_elem, write_h5ad, write_zarr
Expand Down Expand Up @@ -51,11 +50,6 @@ def test_read_partial_adata(tmp_path, diskfmt):
import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# zarr v3 can't write recarray
# https://github.com/zarr-developers/zarr-python/issues/2134
if anndata.settings.zarr_write_format == 3 and isinstance(adata, AnnData):
del adata.uns["rank_genes_groups"]["scores"]
del adata.uns["rank_genes_groups"]["names"]

path = Path(tmp_path) / ("test_rp." + diskfmt)

Expand Down
20 changes: 15 additions & 5 deletions tests/test_readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,11 +843,6 @@ def test_scanpy_pbmc68k(tmp_path, diskfmt, roundtrip, diskfmt2):
with warnings.catch_warnings():
warnings.simplefilter("ignore", ad.OldFormatWarning)
pbmc = sc.datasets.pbmc68k_reduced()
# zarr v3 can't write recarray
# https://github.com/zarr-developers/zarr-python/issues/2134
if ad.settings.zarr_write_format == 3:
del pbmc.uns["rank_genes_groups"]["names"]
del pbmc.uns["rank_genes_groups"]["scores"]

from_disk1 = roundtrip(pbmc, filepth1) # Do we read okay
from_disk2 = roundtrip2(from_disk1, filepth2) # Can we round trip
Expand Down Expand Up @@ -1010,3 +1005,18 @@ def test_write_elem_consolidated(tmp_path: Path):
ValueError, match="Cannot overwrite/edit a store with consolidated metadata"
):
ad.io.write_elem(g["obs"], "foo", np.arange(10))


@pytest.mark.zarr_io
@pytest.mark.skipif(is_zarr_v2(), reason="zarr v3 package test")
def test_write_elem_version_mismatch(tmp_path: Path):
    """Round-trip an AnnData through a group whose format differs from the setting.

    The target group is opened with whichever of zarr formats 2 and 3 is
    *not* the current value of ``ad.settings.zarr_write_format``; the object
    written into it must still read back equal to the original.
    """
    # Deliberately choose the format that the global setting does not select.
    if ad.settings.zarr_write_format == 3:
        mismatched_format = 2
    else:
        mismatched_format = 3

    group = zarr.open_group(
        tmp_path / "foo.zarr",
        mode="w",
        zarr_format=mismatched_format,
    )

    original = ad.AnnData(np.ones((10, 10)))
    ad.io.write_elem(group, "/", original)

    assert_equal(ad.read_zarr(group), original)