
Commit b4c0cfb

Backport PR #2172 on branch 0.12.x (fix: remove global lock on zarr dense stores from dask) (#2180)
1 parent e30a8cc commit b4c0cfb

8 files changed: +50 −35 lines changed


.github/workflows/test-cpu.yml

Lines changed: 2 additions & 2 deletions
@@ -43,7 +43,7 @@ jobs:
     strategy:
       matrix:
         env: ${{ fromJSON(needs.get-environments.outputs.envs) }}
-        io_mark: ["zarr_io", "not zarr_io"]
+        io_mark: ["zarr_io", "not zarr_io", "dask_distributed"] # dask_distributed should not be run with -n auto as it uses a client with processes
     env: # environment variables for use in codecov’s env_vars tagging
       ENV_NAME: ${{ matrix.env.name }}
       IO_MARK: ${{ matrix.io_mark }}
@@ -72,7 +72,7 @@ jobs:
         env:
           COVERAGE_PROCESS_START: ${{ github.workspace }}/pyproject.toml
         run: |
-          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes -n auto --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
+          hatch run ${{ matrix.env.name }}:run-cov -v --color=yes ${{ matrix.io_mark != 'dask_distributed' && '-n auto' || '' }} --junitxml=test-data/test-results.xml -m "${{ matrix.io_mark }}" ${{ matrix.env.args }}
           hatch run ${{ matrix.env.name }}:cov-combine
           hatch run ${{ matrix.env.name }}:coverage xml
 

docs/release-notes/2172.bug.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{func}`dask.array.store` was producing corrupted data with zarr v3 + the distributed scheduler + a lock (which we used internally): see {ref}`dask/dask#12109`. Dense arrays were therefore potentially being stored with corrupted data. The solution is to remove the lock for newer versions of dask; because older versions cannot store the data without the lock, dask versions older than `2025.4.0` are no longer supported for writing dense data. {user}`ilan-gold`
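
For context, a minimal sketch of the storage pattern involved (illustrative only: the store path, shapes, and chunking are invented; it assumes zarr-python v3, dask >= 2025.4.0, and a distributed client):

import dask.array as da
import zarr
from dask.distributed import Client

if __name__ == "__main__":
    client = Client(processes=True)  # distributed scheduler with worker processes

    x = da.random.random((1_000, 100), chunks=(100, 100))
    g = zarr.open_group("example.zarr", mode="w")  # hypothetical on-disk store
    z = g.require_array("X", shape=x.shape, dtype=x.dtype)

    # No explicit lock is passed, which is what this change does internally.
    # On dask older than 2025.4.0 this write path is not supported for dense data.
    da.store(x, z)
    client.close()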

pyproject.toml

Lines changed: 5 additions & 1 deletion
@@ -174,7 +174,11 @@ testpaths = [
 ]
 # For some reason this effects how logging is shown when tests are run
 xfail_strict = true
-markers = [ "gpu: mark test to run on GPU", "zarr_io: mark tests that involve zarr io" ]
+markers = [
+  "gpu: mark test to run on GPU",
+  "zarr_io: mark tests that involve zarr io",
+  "dask_distributed: tests that need a distributed client with multiple processes",
+]
 
 [tool.ruff]
 src = [ "src" ]
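
For reference, a hedged sketch of selecting only these tests programmatically (equivalent to `pytest -m dask_distributed` on the command line; the CI change above does the same while dropping `-n auto`):

import pytest

# Run only tests that need a multi-process distributed client,
# deliberately without pytest-xdist's "-n auto" parallelism.
raise SystemExit(pytest.main(["-m", "dask_distributed"]))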

src/anndata/_io/specs/methods.py

Lines changed: 16 additions & 25 deletions
@@ -495,31 +495,10 @@ def write_chunked_dense_array_to_group(
 
 @_REGISTRY.register_write(ZarrGroup, views.DaskArrayView, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(ZarrGroup, DaskArray, IOSpec("array", "0.2.0"))
-def write_basic_dask_zarr(
-    f: ZarrGroup,
-    k: str,
-    elem: DaskArray,
-    *,
-    _writer: Writer,
-    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
-):
-    import dask.array as da
-
-    dataset_kwargs = dataset_kwargs.copy()
-    dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
-    if is_zarr_v2():
-        g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
-    else:
-        g = f.require_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
-    da.store(elem, g, lock=GLOBAL_LOCK)
-
-
-# Adding this separately because h5py isn't serializable
-# https://github.com/pydata/xarray/issues/4242
 @_REGISTRY.register_write(H5Group, views.DaskArrayView, IOSpec("array", "0.2.0"))
 @_REGISTRY.register_write(H5Group, DaskArray, IOSpec("array", "0.2.0"))
-def write_basic_dask_h5(
-    f: H5Group,
+def write_basic_dask_dask_dense(
+    f: ZarrGroup | H5Group,
     k: str,
     elem: DaskArray,
     *,
@@ -529,11 +508,23 @@ def write_basic_dask_h5(
     import dask.array as da
     import dask.config as dc
 
-    if dc.get("scheduler", None) == "dask.distributed":
+    is_distributed = dc.get("scheduler", None) == "dask.distributed"
+    is_h5 = isinstance(f, H5Group)
+    if is_distributed and is_h5:
         msg = "Cannot write dask arrays to hdf5 when using distributed scheduler"
         raise ValueError(msg)
 
-    g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
+    dataset_kwargs = dataset_kwargs.copy()
+    if not is_h5:
+        dataset_kwargs = zarr_v3_compressor_compat(dataset_kwargs)
+    # See https://github.com/dask/dask/issues/12109
+    if Version(version("dask")) < Version("2025.4.0") and is_distributed:
+        msg = "Writing dense data with a distributed scheduler to zarr could produce corrupted data with a Lock and will error without one when dask is older than 2025.4.0: https://github.com/dask/dask/issues/12109"
+        raise RuntimeError(msg)
+    if is_zarr_v2() or is_h5:
+        g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
+    else:
+        g = f.require_array(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
     da.store(elem, g)
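
A hedged, user-level sketch of the write path this function serves (the file name and array sizes are invented; assumes dask >= 2025.4.0 when a distributed client is active):

import anndata as ad
import dask.array as da
from dask.distributed import Client

if __name__ == "__main__":
    with Client(processes=True):  # distributed scheduler
        adata = ad.AnnData(X=da.random.random((500, 50), chunks=(100, 50)))
        # Dispatches to the registered dask writer above; on dask < 2025.4.0 with a
        # distributed scheduler this now raises RuntimeError instead of silently
        # writing potentially corrupted data.
        adata.write_zarr("dense_example.zarr")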

tests/conftest.py

Lines changed: 7 additions & 4 deletions
@@ -118,11 +118,14 @@ def local_cluster_addr(
     # Adapted from https://pytest-xdist.readthedocs.io/en/latest/how-to.html#making-session-scoped-fixtures-execute-only-once
     import dask.distributed as dd
 
-    def make_cluster() -> dd.LocalCluster:
-        return dd.LocalCluster(n_workers=1, threads_per_worker=1)
+    def make_cluster(worker_id: str) -> dd.LocalCluster:
+        # If we're not using multiple pytest-xdist workers, let the cluster have multiple workers.
+        return dd.LocalCluster(
+            n_workers=1 if worker_id != "master" else 2, threads_per_worker=1
+        )
 
     if worker_id == "master":
-        with make_cluster() as cluster:
+        with make_cluster(worker_id) as cluster:
             yield cluster.scheduler_address
             return
 
@@ -138,7 +141,7 @@ def make_cluster() -> dd.LocalCluster:
         yield address
         return
 
-    with make_cluster() as cluster:
+    with make_cluster(worker_id) as cluster:
         fn.write_text(cluster.scheduler_address)
         lock.release()
         yield cluster.scheduler_address
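
A hedged sketch of how a test consumes this fixture (the test body is hypothetical; the decorators mirror the ones added in the test files below):

import pytest


@pytest.mark.xdist_group("dask")
@pytest.mark.dask_distributed
def test_uses_shared_cluster(local_cluster_addr: str) -> None:
    import dask.distributed as dd

    with dd.Client(local_cluster_addr):
        ...  # dask work here runs on the session-scoped LocalCluster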

tests/lazy/test_concat.py

Lines changed: 1 addition & 0 deletions
@@ -218,6 +218,7 @@ def test_concat_to_memory_var(
 
 
 @pytest.mark.xdist_group("dask")
+@pytest.mark.dask_distributed
 def test_concat_data_with_cluster_to_memory(
     adata_remote: AnnData, join: Join_T, local_cluster_addr: str
 ) -> None:

tests/test_dask.py

Lines changed: 14 additions & 2 deletions
@@ -4,11 +4,14 @@
 
 from __future__ import annotations
 
+from importlib.metadata import version
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
 import pytest
+from packaging.version import Version
 from scipy import sparse
 
 import anndata as ad
@@ -107,6 +110,7 @@ def test_dask_write(adata, tmp_path, diskfmt):
 
 
 @pytest.mark.xdist_group("dask")
+@pytest.mark.dask_distributed
 def test_dask_distributed_write(
     adata: AnnData,
     tmp_path: Path,
@@ -126,8 +130,15 @@ def test_dask_distributed_write(
     adata.obsm["b"] = da.random.random((M, 10))
     adata.varm["a"] = da.random.random((N, 10))
     orig = adata
-    if diskfmt == "h5ad":
-        with pytest.raises(ValueError, match=r"Cannot write dask arrays to hdf5"):
+    is_h5 = diskfmt == "h5ad"
+    is_corrupted_dask = Version(version("dask")) < Version("2025.4.0")
+    if is_corrupted_dask or is_h5:
+        with pytest.raises(
+            ValueError if is_h5 else RuntimeError,
+            match=r"Cannot write dask arrays to hdf5"
+            if is_h5
+            else r"Writing dense data with a distributed scheduler to zarr",
+        ):
             ad.io.write_elem(g, "", orig)
         return
     ad.io.write_elem(g, "", orig)
@@ -140,6 +151,7 @@ def test_dask_distributed_write(
 
     assert_equal(curr.varm["a"], orig.varm["a"])
     assert_equal(curr.obsm["a"], orig.obsm["a"])
+    assert_equal(curr.X, orig.X)
 
     assert isinstance(curr.X, np.ndarray)
     assert isinstance(curr.obsm["a"], np.ndarray)

tests/test_io_elementwise.py

Lines changed: 4 additions & 1 deletion
@@ -334,8 +334,11 @@ def test_read_lazy_subsets_nd_dask(store, n_dims, chunks):
 
 
 @pytest.mark.xdist_group("dask")
+@pytest.mark.dask_distributed
 def test_read_lazy_h5_cluster(
-    sparse_format: Literal["csr", "csc"], tmp_path: Path, local_cluster_addr: str
+    sparse_format: Literal["csr", "csc"],
+    tmp_path: Path,
+    local_cluster_addr: str,
 ) -> None:
     import dask.distributed as dd
 