fix: merge strategies for concat_on_disk (#2122) (#2142)

ilan-gold · milos7250 · pre-commit-ci[bot] · web-flow · commit 795fb3424589 · 2025-10-02T10:18:55.000Z
Co-authored-by: Miloš Mičík &lt;56844787+milos7250@users.noreply.github.com&gt;
Co-authored-by: pre-commit-ci[bot] &lt;66853113+pre-commit-ci[bot]@users.noreply.github.com&gt;
Co-authored-by: amalia-k510 &lt;kareshamal@gmail.com&gt;
diff --git a/docs/release-notes/2122.fix.md b/docs/release-notes/2122.fix.md
@@ -0,0 +1 @@
+Respect off-axis merge options in {func}`anndata.experimental.concat_on_disk` {user}`ilan-gold`
diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
@@ -143,11 +143,16 @@ def equal_dask_array(a, b) -> bool:
         return False
     if isinstance(b, DaskArray) and tokenize(a) == tokenize(b):
         return True
-    if isinstance(a._meta, CSMatrix):
+    if isinstance(a._meta, np.ndarray):
+        return da.equal(a, b, where=~(da.isnan(a) & da.isnan(b))).all().compute()
+    if a.chunksize == b.chunksize and isinstance(
+        a._meta, CupySparseMatrix | CSMatrix | CSArray
+    ):
         # TODO: Maybe also do this in the other case?
         return da.map_blocks(equal, a, b, drop_axis=(0, 1)).all()
-    else:
-        return da.equal(a, b, where=~(da.isnan(a) == da.isnan(b))).all()
+    msg = "Misaligned chunks detected when checking for merge equality of dask arrays.  Reading full arrays into memory."
+    warn(msg, UserWarning, stacklevel=3)
+    return equal(a.compute(), b.compute())
 
 
 @equal.register(np.ndarray)
diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py
@@ -27,7 +27,7 @@
 from .._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset
 from .._io.specs import read_elem, write_elem
 from ..compat import H5Array, H5Group, ZarrArray, ZarrGroup
-from . import read_dispatched
+from . import read_dispatched, read_elem_lazy
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Collection, Iterable, Sequence
@@ -173,7 +173,7 @@ def write_concat_dense(  # noqa: PLR0917
     output_path: ZarrGroup | H5Group,
     axis: Literal[0, 1] = 0,
     reindexers: Reindexer | None = None,
-    fill_value=None,
+    fill_value: Any = None,
 ):
     """
     Writes the concatenation of given dense arrays to disk using dask.
@@ -206,7 +206,7 @@ def write_concat_sparse(  # noqa: PLR0917
     max_loaded_elems: int,
     axis: Literal[0, 1] = 0,
     reindexers: Reindexer | None = None,
-    fill_value=None,
+    fill_value: Any = None,
 ):
     """
     Writes and concatenates sparse datasets into a single output dataset.
@@ -246,20 +246,20 @@ def write_concat_sparse(  # noqa: PLR0917
 
 
 def _write_concat_mappings(  # noqa: PLR0913, PLR0917
-    mappings,
+    mappings: Collection[dict],
     output_group: ZarrGroup | H5Group,
-    keys,
-    path,
-    max_loaded_elems,
-    axis=0,
-    index=None,
-    reindexers=None,
-    fill_value=None,
+    keys: Collection[str],
+    output_path: str | Path,
+    max_loaded_elems: int,
+    axis: Literal[0, 1] = 0,
+    index: pd.Index = None,
+    reindexers: list[Reindexer] | None = None,
+    fill_value: Any = None,
 ):
     """
     Write a list of mappings to a zarr/h5 group.
     """
-    mapping_group = output_group.create_group(path)
+    mapping_group = output_group.create_group(output_path)
     mapping_group.attrs.update({
         "encoding-type": "dict",
         "encoding-version": "0.1.0",
@@ -280,13 +280,13 @@ def _write_concat_mappings(  # noqa: PLR0913, PLR0917
 
 def _write_concat_arrays(  # noqa: PLR0913, PLR0917
     arrays: Sequence[ZarrArray | H5Array | BaseCompressedSparseDataset],
-    output_group,
-    output_path,
-    max_loaded_elems,
-    axis=0,
-    reindexers=None,
-    fill_value=None,
-    join="inner",
+    output_group: ZarrGroup | H5Group,
+    output_path: str | Path,
+    max_loaded_elems: int,
+    axis: Literal[0, 1] = 0,
+    reindexers: list[Reindexer] | None = None,
+    fill_value: Any = None,
+    join: Literal["inner", "outer"] = "inner",
 ):
     init_elem = arrays[0]
     init_type = type(init_elem)
@@ -324,14 +324,14 @@ def _write_concat_arrays(  # noqa: PLR0913, PLR0917
 
 def _write_concat_sequence(  # noqa: PLR0913, PLR0917
     arrays: Sequence[pd.DataFrame | BaseCompressedSparseDataset | H5Array | ZarrArray],
-    output_group,
-    output_path,
-    max_loaded_elems,
-    axis=0,
-    index=None,
-    reindexers=None,
-    fill_value=None,
-    join="inner",
+    output_group: ZarrGroup | H5Group,
+    output_path: str | Path,
+    max_loaded_elems: int,
+    axis: Literal[0, 1] = 0,
+    index: pd.Index = None,
+    reindexers: list[Reindexer] | None = None,
+    fill_value: Any = None,
+    join: Literal["inner", "outer"] = "inner",
 ):
     """
     array, dataframe, csc_matrix, csc_matrix
@@ -376,17 +376,27 @@ def _write_concat_sequence(  # noqa: PLR0913, PLR0917
         raise NotImplementedError(msg)
 
 
-def _write_alt_mapping(groups, output_group, alt_axis_name, alt_indices, merge):
-    alt_mapping = merge([read_as_backed(g[alt_axis_name]) for g in groups])
-    # If its empty, we need to write an empty dataframe with the correct index
-    if not alt_mapping:
-        alt_df = pd.DataFrame(index=alt_indices)
-        write_elem(output_group, alt_axis_name, alt_df)
-    else:
-        write_elem(output_group, alt_axis_name, alt_mapping)
+def _write_alt_mapping(
+    groups: Collection[H5Group, ZarrGroup],
+    output_group: ZarrGroup | H5Group,
+    alt_axis_name: Literal["obs", "var"],
+    merge: Callable,
+    reindexers: list[Reindexer],
+):
+    alt_mapping = merge([
+        {k: r(read_elem(v), axis=0) for k, v in dict(g[f"{alt_axis_name}m"]).items()}
+        for r, g in zip(reindexers, groups, strict=True)
+    ])
+    write_elem(output_group, f"{alt_axis_name}m", alt_mapping)
 
 
-def _write_alt_annot(groups, output_group, alt_axis_name, alt_indices, merge):
+def _write_alt_annot(
+    groups: Collection[H5Group, ZarrGroup],
+    output_group: ZarrGroup | H5Group,
+    alt_axis_name: Literal["obs", "var"],
+    alt_indices: pd.Index,
+    merge: Callable,
+):
     # Annotation for other axis
     alt_annot = merge_dataframes(
         [read_elem(g[alt_axis_name]) for g in groups], alt_indices, merge
@@ -395,7 +405,13 @@ def _write_alt_annot(groups, output_group, alt_axis_name, alt_indices, merge):
 
 
 def _write_axis_annot(  # noqa: PLR0917
-    groups, output_group, axis_name, concat_indices, label, label_col, join
+    groups: Collection[H5Group, ZarrGroup],
+    output_group: ZarrGroup | H5Group,
+    axis_name: Literal["obs", "var"],
+    concat_indices: pd.Index,
+    label: str,
+    label_col: str,
+    join: Literal["inner", "outer"],
 ):
     concat_annot = pd.concat(
         unify_dtypes(read_elem(g[axis_name]) for g in groups),
@@ -408,6 +424,23 @@ def _write_axis_annot(  # noqa: PLR0917
     write_elem(output_group, axis_name, concat_annot)
 
 
+def _write_alt_pairwise(
+    groups: Collection[H5Group, ZarrGroup],
+    output_group: ZarrGroup | H5Group,
+    alt_axis_name: Literal["obs", "var"],
+    merge: Callable,
+    reindexers: list[Reindexer],
+):
+    alt_pairwise = merge([
+        {
+            k: r(r(read_elem_lazy(v), axis=0), axis=1)
+            for k, v in dict(g[f"{alt_axis_name}p"]).items()
+        }
+        for r, g in zip(reindexers, groups, strict=True)
+    ])
+    write_elem(output_group, f"{alt_axis_name}p", alt_pairwise)
+
+
 def concat_on_disk(  # noqa: PLR0912, PLR0913, PLR0915
     in_files: Collection[PathLike[str] | str] | Mapping[str, PathLike[str] | str],
     out_file: PathLike[str] | str,
@@ -490,7 +523,8 @@ def concat_on_disk(  # noqa: PLR0912, PLR0913, PLR0915
         DataFrames are padded with missing values.
     pairwise
         Whether pairwise elements along the concatenated dimension should be included.
-        This is False by default, since the resulting arrays are often not meaningful.
+        This is False by default, since the resulting arrays are often not meaningful, and raises {class}`NotImplementedError` when True.
+        If you are interested in this feature, please open an issue.
 
     Notes
     -----
@@ -634,7 +668,10 @@ def concat_on_disk(  # noqa: PLR0912, PLR0913, PLR0915
     _write_alt_annot(groups, output_group, alt_axis_name, alt_index, merge)
 
     # Write {alt_axis_name}m
-    _write_alt_mapping(groups, output_group, alt_axis_name, alt_index, merge)
+    _write_alt_mapping(groups, output_group, alt_axis_name, merge, reindexers)
+
+    # Write {alt_axis_name}p
+    _write_alt_pairwise(groups, output_group, alt_axis_name, merge, reindexers)
 
     # Write X
 
diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py
@@ -9,6 +9,7 @@
 from scipy import sparse
 
 from anndata import AnnData, concat
+from anndata._core import merge
 from anndata._core.merge import _resolve_axis
 from anndata.experimental.merge import as_group, concat_on_disk
 from anndata.io import read_elem, write_elem
@@ -31,6 +32,11 @@
 )
 
 
+@pytest.fixture(params=list(merge.MERGE_STRATEGIES.keys()))
+def merge_strategy(request):
+    return request.param
+
+
 @pytest.fixture(params=[0, 1])
 def axis(request) -> Literal[0, 1]:
     return request.param
@@ -86,16 +92,17 @@ def assert_eq_concat_on_disk(
     file_format: Literal["zarr", "h5ad"],
     max_loaded_elems: int | None = None,
     *args,
+    merge_strategy: merge.StrategiesLiteral | None = None,
     **kwargs,
 ):
     # create one from the concat function
-    res1 = concat(adatas, *args, **kwargs)
+    res1 = concat(adatas, *args, merge=merge_strategy, **kwargs)
     # create one from the on disk concat function
     paths = _adatas_to_paths(adatas, tmp_path, file_format)
     out_name = tmp_path / f"out.{file_format}"
     if max_loaded_elems is not None:
         kwargs["max_loaded_elems"] = max_loaded_elems
-    concat_on_disk(paths, out_name, *args, **kwargs)
+    concat_on_disk(paths, out_name, *args, merge=merge_strategy, **kwargs)
     res2 = read_elem(as_group(out_name, mode="r"))
     assert_equal(res1, res2, exact=False)
 
@@ -112,6 +119,7 @@ def get_array_type(array_type, axis):
 
 
 @pytest.mark.parametrize("reindex", [True, False], ids=["reindex", "no_reindex"])
+@pytest.mark.filterwarnings("ignore:Misaligned chunks detected")
 def test_anndatas(
     *,
     axis: Literal[0, 1],
@@ -121,6 +129,7 @@ def test_anndatas(
     max_loaded_elems: int,
     file_format: Literal["zarr", "h5ad"],
     reindex: bool,
+    merge_strategy: merge.StrategiesLiteral,
 ):
     _, off_axis_name = _resolve_axis(1 - axis)
     random_axes = {0, 1} if reindex else {axis}
@@ -159,6 +168,7 @@ def test_anndatas(
         max_loaded_elems,
         axis=axis,
         join=join_type,
+        merge_strategy=merge_strategy,
     )
 
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Respect off-axis merge options in {func}`anndata.experimental.concat_on_disk` {user}`ilan-gold`