Merge branch 'main' into html_rep

katosh · web-flow · commit e486c3c8f619 · 2026-04-28T14:38:48.000-07:00
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -64,4 +64,4 @@ jobs:
         working-directory: ${{ env.ASV_DIR }}
         run: |
           asv machine --yes
-          asv run --quick --show-stderr --verbose
+          asv run --dry-run --quick --show-stderr --verbose HEAD^!
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ ci:
 
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.15.10
+    rev: v0.15.12
     hooks:
       - id: ruff-check
         args: ["--fix"]
@@ -13,7 +13,7 @@ repos:
         id: ruff
         args: ["--preview", "--select=PLR0917"]
   - repo: https://github.com/biomejs/pre-commit
-    rev: v2.4.11
+    rev: v2.4.13
     hooks:
       - id: biome-format
   - repo: https://github.com/ComPWA/taplo-pre-commit
diff --git a/benchmarks/benchmarks/sparse_dataset.py b/benchmarks/benchmarks/sparse_dataset.py
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 from types import MappingProxyType
+from typing import TYPE_CHECKING
 
 import numpy as np
+import pandas as pd
 import zarr
 from dask.array.core import Array as DaskArray
 from scipy import sparse
@@ -12,6 +14,9 @@
 from anndata._io.specs import write_elem
 from anndata.experimental import read_elem_lazy
 
+if TYPE_CHECKING:
+    from typing import Literal
+
 
 def make_alternating_mask(n):
     mask_alternating = np.ones(10_000, dtype=bool)
@@ -79,9 +84,12 @@ def peakmem_getitem_adata(self, *_):
             res.compute()
 
 
-class SparseCSRDask:
+class SparseCSRDaskConcat:
     filepath = "data.zarr"
 
+    params = (["inner", "outer"], [0, -1])
+    param_names = ("join", "fill_value")
+
     def setup_cache(self):
         X = sparse.random(
             10_000,
@@ -93,18 +101,59 @@ def setup_cache(self):
         g = zarr.group(self.filepath)
         write_elem(g, "X", X)
 
-    def setup(self):
+    def setup(self, *_):
         self.group = zarr.group(self.filepath)
-        self.adata = AnnData(X=read_elem_lazy(self.group["X"]))
+        self.adatas = [
+            AnnData(
+                var=pd.DataFrame(
+                    index=[
+                        f"gene_{j}{f'_{i}' if (j % 500 == 0) else ''}"
+                        for j in range(10_000)
+                    ]
+                ),
+                X=read_elem_lazy(self.group["X"]),
+            )
+            for i in range(5)
+        ]
+
+    def time_concat(self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]):
+        concat(self.adatas, join=join, fill_value=fill_value)
+
+    def peakmem_concat(
+        self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]
+    ):
+        concat(self.adatas, join=join, fill_value=fill_value)
+
+    def time_concat_with_mem(
+        self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]
+    ):
+        concat(self.adatas, join=join, fill_value=fill_value).to_memory()
+
+    def peakmem_concat_with_mem(
+        self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]
+    ):
+        concat(self.adatas, join=join, fill_value=fill_value).to_memory()
 
-    def time_concat(self):
-        concat([self.adata for i in range(100)])
 
-    def peakmem_concat(self):
-        concat([self.adata for i in range(100)])
+class SparseCSRDask:
+    filepath = "data.zarr"
+
+    def setup_cache(self):
+        X = sparse.random(
+            10_000,
+            10_000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
+        g = zarr.group(self.filepath)
+        write_elem(g, "X", X)
+
+    def setup(self, *_):
+        self.group = zarr.group(self.filepath)
 
-    def time_read(self):
+    def time_read(self, *_):
         AnnData(X=read_elem_lazy(self.group["X"]))
 
-    def peakmem_read(self):
+    def peakmem_read(self, *_):
         AnnData(X=read_elem_lazy(self.group["X"]))
diff --git a/docs/concatenation.rst b/docs/concatenation.rst
@@ -26,6 +26,7 @@ Let's start off with an example:
     AnnData object with n_obs × n_vars = 700 × 765
         obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain'
         var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
+        uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
         obsm: 'X_pca', 'X_umap'
         varm: 'PCs'
         obsp: ...
@@ -164,9 +165,9 @@ First, our example case:
     >>> blobs
     AnnData object with n_obs × n_vars = 640 × 30
         obs: 'blobs'
+        uns: 'pca'
         obsm: 'X_pca'
         varm: 'PCs'
-        uns: 'pca'
 
 Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies.
 
@@ -180,9 +181,9 @@ Now we will split this object by the categorical `"blobs"` and recombine it to i
     >>> adatas[0]
     AnnData object with n_obs × n_vars = 128 × 30
         obs: 'blobs'
+        uns: 'pca'
         obsm: 'X_pca', 'qc'
         varm: 'PCs', '0_qc'
-        uns: 'pca'
 
 `adatas` is now a list of datasets with disjoint sets of observations and a common set of variables.
 Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset.
diff --git a/docs/release-notes/2395.perf.md b/docs/release-notes/2395.perf.md
@@ -0,0 +1 @@
+Accelerate outer joins on dask-sparse matrices with unchunked minor axes in {func}`anndata.concat` {user}`ilan-gold`
diff --git a/docs/release-notes/2399.fix.md b/docs/release-notes/2399.fix.md
@@ -0,0 +1 @@
+Disallow {meth}`anndata.AnnData.transpose` when `X` or `layers` contains {class}`h5py.Dataset`, {class}`zarr.Array`, {class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold`.
diff --git a/docs/release-notes/2406.fix.md b/docs/release-notes/2406.fix.md
@@ -0,0 +1 @@
+Fix {meth}`anndata.AnnData.copy` so that it provides an informative error when trying to `copy` an object that contains {class}`h5py.Dataset`, {class}`zarr.Array`, {class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold`
diff --git a/pyproject.toml b/pyproject.toml
@@ -174,7 +174,7 @@ filterwarnings_when_strict = [
     "default::dask.array.core.PerformanceWarning",
     "default:anndata will no longer support zarr v2:DeprecationWarning",
     "default:Consolidated metadata is:UserWarning",
-    "default:.*Structured:zarr.core.dtype.common.UnstableSpecificationWarning",
+    "default:.*Struct:zarr.core.dtype.common.UnstableSpecificationWarning",
     "default:.*FixedLengthUTF32:zarr.core.dtype.common.UnstableSpecificationWarning",
     "default:Automatic shard shape inference is experimental",
     "default:Writing zarr v2:UserWarning",
diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py
@@ -393,12 +393,12 @@ class AlignedMappingProperty[T: AlignedMapping](property):
     The actual data is stored as `f'_{self.name}'` in the parent object.
     """
 
-    name: str
-    """Name of the attribute in the parent object."""
     cls: type[T]
     """Concrete type that will be constructed."""
     axis: Literal[0, 1] | None = None
     """Axis of the parent to align to."""
+    name: str | None = None
+    """Name of the attribute in the parent object."""
 
     def construct(self, obj: AnnData, *, store: MutableMapping[str, Value]) -> T:
         if self.axis is None:
@@ -414,6 +414,9 @@ def fake(): ...
         fake.__annotations__ = {"return": self.cls._actual_class | self.cls._view_class}
         return fake
 
+    def __set_name__(self, owner: AnnData, name: str):
+        self.name = name
+
     def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T:
         if obj is None:
             # When accessed from the class, e.g. via `AnnData.obs`,
diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py
@@ -65,6 +65,7 @@
     from scipy import sparse
     from zarr.storage import StoreLike
 
+    from anndata._types import AnnDataElem
     from anndata.typing import RWAble
 
     from .._types import ReduceFunc
@@ -283,12 +284,6 @@ def _init_as_view(
         oidx: _Index1DNorm | int | np.integer,
         vidx: _Index1DNorm | int | np.integer,
     ):
-        if adata_ref.isbacked and adata_ref.is_view:
-            msg = (
-                "Currently, you cannot index repeatedly into a backed AnnData, "
-                "that is, you cannot make a view of a view."
-            )
-            raise ValueError(msg)
         self._is_view = True
         if isinstance(oidx, int | np.integer):
             if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs):
@@ -705,9 +700,7 @@ def X(self, value: _XDataType | None):
     def X(self):
         self.X = None
 
-    layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty(
-        "layers", Layers
-    )
+    layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty(Layers)
     """\
     Dictionary-like object with values of the same dimensions as :attr:`X`.
 
@@ -923,7 +916,7 @@ def uns(self):
         self.uns = OrderedDict()
 
     obsm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
-        "obsm", AxisArrays, 0
+        AxisArrays, 0
     )
     """\
     Multi-dimensional annotation of observations
@@ -935,7 +928,7 @@ def uns(self):
     """
 
     varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
-        "varm", AxisArrays, 1
+        AxisArrays, 1
     )
     """\
     Multi-dimensional annotation of variables/features
@@ -947,7 +940,7 @@ def uns(self):
     """
 
     obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = (
-        AlignedMappingProperty("obsp", PairwiseArrays, 0)
+        AlignedMappingProperty(PairwiseArrays, 0)
     )
     """\
     Pairwise annotation of observations,
@@ -959,7 +952,7 @@ def uns(self):
     """
 
     varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = (
-        AlignedMappingProperty("varp", PairwiseArrays, 1)
+        AlignedMappingProperty(PairwiseArrays, 1)
     )
     """\
     Pairwise annotation of variables/features,
@@ -1289,6 +1282,12 @@ def transpose(self) -> AnnData:
                 "which is currently not implemented. Call `.copy()` before transposing."
             )
             raise ValueError(msg)
+        if any(
+            isinstance(elem, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset)
+            for elem in (self.X, *self.layers.values())
+        ):
+            msg = "Cannot transpose anndata object that has raw zarr arrays or h5py arrays backing X or layers"
+            raise ValueError(msg)
 
         return AnnData(
             X=_safe_transpose(X) if X is not None else None,
@@ -1464,9 +1463,32 @@ def to_memory(self, *, copy: bool = False) -> AnnData:
 
         return AnnData(**new)
 
+    def _has_raw_zarr_or_h5_array(self) -> bool:
+        def predicate(
+            elem: RWAble,
+            *,
+            accumulate: bool,
+            attr_name: AnnDataElem | None = None,
+        ):
+            if isinstance(elem, MutableMapping):
+                return accumulate or any(
+                    isinstance(
+                        v, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset
+                    )
+                    for v in elem.values()
+                )
+            return accumulate or isinstance(
+                elem, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset
+            )
+
+        return self._reduce(predicate, init=False)
+
     def copy(self, filename: PathLike[str] | str | None = None) -> AnnData:
         """Full copy, optionally on disk."""
         if not self.isbacked:
+            if self._has_raw_zarr_or_h5_array():
+                msg = "Copy is not implemented for anndatas which have backing raw h5 (not in backed mode) or zarr arrays"
+                raise NotImplementedError(msg)
             if self.is_view and self._has_X():
                 # TODO: How do I unambiguously check if this is a copy?
                 # Subsetting this way means we don’t have to have a view type
@@ -1542,7 +1564,7 @@ def predicate(  # noqa: PLR0911
             elem: RWAble,
             *,
             accumulate: bool,
-            attr_name: str | None = None,  # TODO: type
+            attr_name: AnnDataElem | None = None,
         ):
             if elem is None:
                 return accumulate
diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py
@@ -583,6 +583,29 @@ def _apply_to_df_like(self, el: pd.DataFrame | Dataset2D, *, axis, fill_value=No
     def _apply_to_dask_array(self, el: DaskArray, *, axis, fill_value=None):
         import dask.array as da
 
+        indexer = self.idx
+        is_outer = any(indexer == -1)
+        # Fast path for the majority of sparse matrices whose minor-axis is unchunked and is being reindexed.
+        # This prevents 0's from being stored explicitly in the sparse matrices when outer joining, for example (see below).
+        if (
+            is_sparse_sub := isinstance(el._meta, CSArray | CSMatrix)
+            and el.chunksize[minor_axis := int(el._meta.format == "csr")]
+            == el.shape[minor_axis]
+            and axis == minor_axis
+            and is_outer
+        ):
+            return el.map_blocks(
+                partial(
+                    self._apply_to_sparse,
+                    axis=axis,
+                    fill_value=fill_value,
+                    keep_format=True,
+                ),
+                chunks=(el.chunks[0], len(self.new_idx))
+                if minor_axis == 1
+                else (len(self.new_idx), el.chunks[1]),
+                meta=el._meta,
+            )
         if fill_value is None:
             fill_value = default_fill_value([el])
         shape = list(el.shape)
@@ -591,12 +614,11 @@ def _apply_to_dask_array(self, el: DaskArray, *, axis, fill_value=None):
             shape[axis] = len(self.new_idx)
             return da.broadcast_to(fill_value, tuple(shape))
 
-        indexer = self.idx
         sub_el = _subset(el, make_slice(indexer, axis, len(shape)))
 
-        if any(indexer == -1):
+        if is_outer:
             # TODO: Remove this condition once https://github.com/dask/dask/pull/12078 is released
-            if isinstance(sub_el._meta, CSArray | CSMatrix) and np.isscalar(fill_value):
+            if is_sparse_sub and np.isscalar(fill_value):
                 fill_value = np.array([[fill_value]])
             sub_el[make_slice(indexer == -1, axis, len(shape))] = fill_value
 
@@ -658,7 +680,7 @@ def _apply_to_array_api(
         return xp.where(mask, fv, taken)
 
     def _apply_to_sparse(  # noqa: PLR0912
-        self, el: CSMatrix | CSArray, *, axis, fill_value=None
+        self, el: CSMatrix | CSArray, *, axis, fill_value=None, keep_format: bool = True
     ) -> CSMatrix:
         if isinstance(el, CupySparseMatrix):
             from cupyx.scipy import sparse
@@ -730,7 +752,8 @@ def _apply_to_sparse(  # noqa: PLR0912
 
         if fill_idxer is not None:
             out[fill_idxer] = fill_value
-
+        if keep_format:
+            out = out.tocsr() if el.format == "csr" else out.tocsc()
         return out
 
     def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None):
diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py
@@ -116,7 +116,7 @@ def n_obs(self) -> int:
         return self._n_obs
 
     varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
-        "varm", AxisArrays, 1
+        AxisArrays, 1
     )
 
     @property
diff --git a/src/anndata/utils.py b/src/anndata/utils.py
diff --git a/tests/test_backed_dense.py b/tests/test_backed_dense.py
diff --git a/tests/test_backed_hdf5.py b/tests/test_backed_hdf5.py
diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py
diff --git a/tests/test_base.py b/tests/test_base.py
diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Accelerate outer joins on dask-sparse matrices with unchunked minor axes in {func}`anndata.concat` {user}`ilan-gold`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Disallow {meth}`anndata.AnnData.transpose` when `X` or `layers` contains {class}`h5py.Dataset`, {class}`zarr.Array`, {class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold`.
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+Fix {meth}`anndata.AnnData.copy` so that it provides an informative error when trying to `copy` an object that contains {class}`h5py.Dataset`, {class}`zarr.Array`, {class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold`