Skip to content

Commit e5f2c7c

Browse files
feat: migrate to zarr v3 defaults (#2439)
Co-authored-by: Philipp A. <flying-sheep@web.de>
1 parent c87dc0a commit e5f2c7c

11 files changed

Lines changed: 54 additions & 55 deletions

File tree

docs/release-notes/2439.feat.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
The default of {attr}`anndata.settings.zarr_write_format` is now 3 and the default of {attr}`anndata.settings.auto_shard_zarr_v3` is now `True` (with `None` removed as an option).
2+
This means that, by default, sharded zarr v3 stores will be written with a target uncompressed shard size of 1 GB, unless the user either:
3+
4+
- changes one of these flags, or
5+
- explicitly creates a {class}`zarr.Group` with `zarr_format=2`, or
6+
- overrides `shards` in the `dataset_kwargs` passed to {func}`~anndata.io.write_elem`
7+
8+
{user}`ilan-gold`

src/anndata/_io/specs/methods.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -125,12 +125,7 @@ def zarr_v3_sharding(dataset_kwargs: dict, format: Literal[2, 3]) -> Generator[d
125125
and ad.settings.auto_shard_zarr_v3
126126
and format == 3
127127
)
128-
if ad.settings.auto_shard_zarr_v3 is None and format == 3:
129-
warn(
130-
"zarr v3 autosharding will be the default in the next minor release.",
131-
UserWarning,
132-
)
133-
elif auto_sharding:
128+
if auto_sharding:
134129
dataset_kwargs = {**dataset_kwargs, "shards": "auto"}
135130
# Auto shard sizes are a relatively recent feature
136131
supports_auto_shard_size = Version(version("zarr")) >= Version("3.1.4")

src/anndata/_io/specs/registry.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,8 +483,11 @@ def read_elem_lazy(
483483
484484
Reading a dense matrix from a zarr store lazily:
485485
486+
..
487+
TODO: remove “SKIP” once the fix for https://github.com/zarr-developers/zarr-python/issues/3602 is included in the minimum supported zarr version (3.1.6)
488+
486489
>>> adata.layers["dense"] = ad.experimental.read_elem_lazy(g["layers/dense"])
487-
>>> adata.layers["dense"]
490+
>>> adata.layers["dense"] # doctest: +SKIP
488491
dask.array<from-zarr, shape=(2700, 32738), dtype=float32, chunksize=(169, 2047), chunktype=numpy.ndarray>
489492
490493
Making a new anndata object from on-disk, with custom chunks:

src/anndata/_settings.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,7 @@ def validate_zarr_write_format(format: int, settings: SettingsManager):
460460

461461
settings.register(
462462
"zarr_write_format",
463-
default_value=2,
463+
default_value=3,
464464
description="Which version of zarr to write to when anndata must internally open a write-able zarr group.",
465465
validate=validate_zarr_write_format,
466466
get_from_env=lambda name, default: check_and_get_environ_var(
@@ -507,11 +507,11 @@ def validate_sparse_settings(val: Any, settings: SettingsManager) -> None:
507507

508508
settings.register(
509509
"auto_shard_zarr_v3",
510-
default_value=None,
510+
default_value=True,
511511
description="Whether or not to use zarr's auto computation of sharding for v3. For v2 this setting will be ignored. The setting will apply to all calls to anndata's writing mechanism (write_zarr / write_elem) and will **not** override any user-defined kwargs for shards.",
512-
validate=gen_validator((bool, NoneType)),
512+
validate=validate_bool,
513513
option_type=bool | None,
514-
get_from_env=check_and_get_bool_or_none,
514+
get_from_env=check_and_get_bool,
515515
)
516516

517517

src/anndata/_settings.pyi

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@ class _AnnDataSettingsManager(SettingsManager):
4141
check_uniqueness: bool = True
4242
copy_on_write_X: bool = False
4343
allow_write_nullable_strings: bool | None = None
44-
zarr_write_format: Literal[2, 3] = 2
44+
zarr_write_format: Literal[2, 3] = 3
4545
use_sparse_array_on_read: bool = False
4646
min_rows_for_chunked_h5_copy: int = 1000
4747
disallow_forward_slash_in_h5ad: bool = False
4848
write_csr_csc_indices_with_min_possible_dtype: bool = False
49-
auto_shard_zarr_v3: bool | None = None
49+
auto_shard_zarr_v3: bool = True
5050

5151
settings: _AnnDataSettingsManager

tests/conftest.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,6 @@ def backing_h5ad(tmp_path: Path) -> Path:
3333
return tmp_path / "test.h5ad"
3434

3535

36-
@pytest.fixture(autouse=True)
37-
def zarr_shard(request: pytest.FixtureRequest):
38-
with ad.settings.override(auto_shard_zarr_v3=True):
39-
yield
40-
41-
4236
@pytest.fixture(
4337
params=[("h5ad", None), ("zarr", 2), ("zarr", 3)],
4438
ids=["h5ad", "zarr2", "zarr3"],

tests/lazy/conftest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,8 @@ def adata_remote_with_store_tall_skinny_path(
141141
g,
142142
"obs",
143143
obs,
144-
dataset_kwargs=dict(chunks=(250,)),
144+
# No shards so we can track chunking exactly.
145+
dataset_kwargs=dict(chunks=(250,), shards=None),
145146
)
146147
zarr.consolidate_metadata(g.store)
147148
return orig_path

tests/lazy/test_read.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

3+
import json
34
from importlib.util import find_spec
5+
from pathlib import Path
46
from typing import TYPE_CHECKING
57

68
import numpy as np
@@ -23,7 +25,6 @@
2325

2426
if TYPE_CHECKING:
2527
from collections.abc import Callable
26-
from pathlib import Path
2728

2829
from anndata._types import AnnDataElem
2930

@@ -188,14 +189,19 @@ def test_unconsolidated(tmp_path: Path, mtx_format):
188189
adata = gen_adata((10, 10), mtx_format, **GEN_ADATA_NO_XARRAY_ARGS)
189190
orig_pth = tmp_path / "orig.zarr"
190191
adata.write_zarr(orig_pth)
191-
(orig_pth / ".zmetadata").unlink()
192+
with Path.open(orig_pth / "zarr.json", "r+") as f:
193+
data = json.load(f)
194+
del data["consolidated_metadata"]
195+
f.seek(0)
196+
json.dump(data, f)
197+
f.truncate()
192198
store = AccessTrackingStore(orig_pth, read_only=True)
193-
store.initialize_key_trackers(["obs/.zgroup", ".zgroup"])
199+
store.initialize_key_trackers(["obs/zarr.json", "zarr.json"])
194200
with pytest.warns(UserWarning, match=r"Did not read zarr as consolidated"):
195201
remote = read_lazy(store)
196202
remote_to_memory = remote.to_memory()
197203
assert_equal(remote_to_memory, adata)
198-
store.assert_access_count("obs/.zgroup", 1)
204+
store.assert_access_count("obs/zarr.json", 1)
199205

200206

201207
@pytest.mark.zarr_io

tests/test_backed_sparse.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -397,10 +397,7 @@ def test_lazy_array_cache(
397397
a_disk[3:5]
398398
a_disk[6:7]
399399
a_disk[8:9]
400-
# Three hits for metadata in zarr v3:
401-
# see https://github.com/zarr-developers/zarr-python/discussions/2760 for more info on the difference.
402-
# Then there is actual data access, 1 more when cached, 4 more otherwise.
403-
c_expected = 4 if should_cache_indptr else 7
400+
c_expected = 2 if should_cache_indptr else 5
404401
assert store.get_access_count("X/indptr") == c_expected
405402
for elem_not_indptr in elems - {"indptr"}:
406403
assert (
@@ -540,9 +537,9 @@ def test_data_access(
540537
subset.X.compute(scheduler="single-threaded")
541538
# zarr v2 fetches all and not just metadata for that node in 3.X.X python package
542539
# TODO: https://github.com/zarr-developers/zarr-python/discussions/2760
543-
if (
544-
ad.settings.zarr_write_format == 2 and read_data
545-
) or open_func is not sparse_dataset:
540+
if ad.settings.zarr_write_format == 2 and (
541+
read_data or open_func is not sparse_dataset
542+
):
546543
exp = [*exp, "X/data/.zgroup", "X/data/.zattrs"]
547544

548545
assert store.get_access_count("X/data") == len(exp), store.get_accessed_keys(

tests/test_concatenate_disk.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from scipy import sparse
1212

1313
import anndata as ad
14-
from anndata import AnnData, concat, settings
14+
from anndata import AnnData, concat
1515
from anndata._core import merge
1616
from anndata._core.merge import _resolve_axis
1717
from anndata.experimental.merge import as_group, concat_on_disk
@@ -261,11 +261,10 @@ def test_concatenate_xxxm(xxxm_adatas, tmp_path, file_format, join_type):
261261
assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format, join=join_type)
262262

263263

264-
def test_concatenate_zarr_v3_shard(xxxm_adatas, tmp_path):
264+
def test_concatenate_zarr_stays_sharded_v3(xxxm_adatas, tmp_path):
265265
import zarr
266266

267-
with settings.override(auto_shard_zarr_v3=True, zarr_write_format=3):
268-
assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format="zarr")
267+
assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format="zarr")
269268
g = zarr.open(tmp_path)
270269
assert g.metadata.zarr_format == 3
271270

0 commit comments

Comments
 (0)