Skip to content

Commit e486c3c

Browse files
authored
Merge branch 'main' into html_rep
2 parents d274fc7 + b721425 commit e486c3c

18 files changed

Lines changed: 280 additions & 63 deletions

.github/workflows/benchmark.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,4 +64,4 @@ jobs:
6464
working-directory: ${{ env.ASV_DIR }}
6565
run: |
6666
asv machine --yes
67-
asv run --quick --show-stderr --verbose
67+
asv run --dry-run --quick --show-stderr --verbose HEAD^!

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ ci:
33

44
repos:
55
- repo: https://github.com/astral-sh/ruff-pre-commit
6-
rev: v0.15.10
6+
rev: v0.15.12
77
hooks:
88
- id: ruff-check
99
args: ["--fix"]
@@ -13,7 +13,7 @@ repos:
1313
id: ruff
1414
args: ["--preview", "--select=PLR0917"]
1515
- repo: https://github.com/biomejs/pre-commit
16-
rev: v2.4.11
16+
rev: v2.4.13
1717
hooks:
1818
- id: biome-format
1919
- repo: https://github.com/ComPWA/taplo-pre-commit

benchmarks/benchmarks/sparse_dataset.py

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from __future__ import annotations
22

33
from types import MappingProxyType
4+
from typing import TYPE_CHECKING
45

56
import numpy as np
7+
import pandas as pd
68
import zarr
79
from dask.array.core import Array as DaskArray
810
from scipy import sparse
@@ -12,6 +14,9 @@
1214
from anndata._io.specs import write_elem
1315
from anndata.experimental import read_elem_lazy
1416

17+
if TYPE_CHECKING:
18+
from typing import Literal
19+
1520

1621
def make_alternating_mask(n):
1722
mask_alternating = np.ones(10_000, dtype=bool)
@@ -79,9 +84,12 @@ def peakmem_getitem_adata(self, *_):
7984
res.compute()
8085

8186

82-
class SparseCSRDask:
87+
class SparseCSRDaskConcat:
8388
filepath = "data.zarr"
8489

90+
params = (["inner", "outer"], [0, -1])
91+
param_names = ("join", "fill_value")
92+
8593
def setup_cache(self):
8694
X = sparse.random(
8795
10_000,
@@ -93,18 +101,59 @@ def setup_cache(self):
93101
g = zarr.group(self.filepath)
94102
write_elem(g, "X", X)
95103

96-
def setup(self):
104+
def setup(self, *_):
97105
self.group = zarr.group(self.filepath)
98-
self.adata = AnnData(X=read_elem_lazy(self.group["X"]))
106+
self.adatas = [
107+
AnnData(
108+
var=pd.DataFrame(
109+
index=[
110+
f"gene_{j}{f'_{i}' if (j % 500 == 0) else ''}"
111+
for j in range(10_000)
112+
]
113+
),
114+
X=read_elem_lazy(self.group["X"]),
115+
)
116+
for i in range(5)
117+
]
118+
119+
def time_concat(self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]):
120+
concat(self.adatas, join=join, fill_value=fill_value)
121+
122+
def peakmem_concat(
123+
self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]
124+
):
125+
concat(self.adatas, join=join, fill_value=fill_value)
126+
127+
def time_concat_with_mem(
128+
self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]
129+
):
130+
concat(self.adatas, join=join, fill_value=fill_value).to_memory()
131+
132+
def peakmem_concat_with_mem(
133+
self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]
134+
):
135+
concat(self.adatas, join=join, fill_value=fill_value).to_memory()
99136

100-
def time_concat(self):
101-
concat([self.adata for i in range(100)])
102137

103-
def peakmem_concat(self):
104-
concat([self.adata for i in range(100)])
138+
class SparseCSRDask:
139+
filepath = "data.zarr"
140+
141+
def setup_cache(self):
142+
X = sparse.random(
143+
10_000,
144+
10_000,
145+
density=0.01,
146+
format="csr",
147+
random_state=np.random.default_rng(42),
148+
)
149+
g = zarr.group(self.filepath)
150+
write_elem(g, "X", X)
151+
152+
def setup(self, *_):
153+
self.group = zarr.group(self.filepath)
105154

106-
def time_read(self):
155+
def time_read(self, *_):
107156
AnnData(X=read_elem_lazy(self.group["X"]))
108157

109-
def peakmem_read(self):
158+
def peakmem_read(self, *_):
110159
AnnData(X=read_elem_lazy(self.group["X"]))

docs/concatenation.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Let's start off with an example:
2626
AnnData object with n_obs × n_vars = 700 × 765
2727
obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain'
2828
var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
29+
uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
2930
obsm: 'X_pca', 'X_umap'
3031
varm: 'PCs'
3132
obsp: ...
@@ -164,9 +165,9 @@ First, our example case:
164165
>>> blobs
165166
AnnData object with n_obs × n_vars = 640 × 30
166167
obs: 'blobs'
168+
uns: 'pca'
167169
obsm: 'X_pca'
168170
varm: 'PCs'
169-
uns: 'pca'
170171

171172
Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies.
172173

@@ -180,9 +181,9 @@ Now we will split this object by the categorical `"blobs"` and recombine it to i
180181
>>> adatas[0]
181182
AnnData object with n_obs × n_vars = 128 × 30
182183
obs: 'blobs'
184+
uns: 'pca'
183185
obsm: 'X_pca', 'qc'
184186
varm: 'PCs', '0_qc'
185-
uns: 'pca'
186187

187188
`adatas` is now a list of datasets with disjoint sets of observations and a common set of variables.
188189
Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset.

docs/release-notes/2395.perf.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Accelerate outer joins on dask-sparse matrices with unchunked minor axes in {func}`anndata.concat` {user}`ilan-gold`

docs/release-notes/2399.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Disallow {meth}`anndata.AnnData.transpose` when `X` or `layers` contains {class}`h5py.Dataset`, {class}`zarr.Array`, {class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold`.

docs/release-notes/2406.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix {meth}`anndata.AnnData.copy` so that it provides an informative error when trying to `copy` an object that contains {class}`h5py.Dataset`, {class}`zarr.Array`, {class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold`

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ filterwarnings_when_strict = [
174174
"default::dask.array.core.PerformanceWarning",
175175
"default:anndata will no longer support zarr v2:DeprecationWarning",
176176
"default:Consolidated metadata is:UserWarning",
177-
"default:.*Structured:zarr.core.dtype.common.UnstableSpecificationWarning",
177+
"default:.*Struct:zarr.core.dtype.common.UnstableSpecificationWarning",
178178
"default:.*FixedLengthUTF32:zarr.core.dtype.common.UnstableSpecificationWarning",
179179
"default:Automatic shard shape inference is experimental",
180180
"default:Writing zarr v2:UserWarning",

src/anndata/_core/aligned_mapping.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -393,12 +393,12 @@ class AlignedMappingProperty[T: AlignedMapping](property):
393393
The actual data is stored as `f'_{self.name}'` in the parent object.
394394
"""
395395

396-
name: str
397-
"""Name of the attribute in the parent object."""
398396
cls: type[T]
399397
"""Concrete type that will be constructed."""
400398
axis: Literal[0, 1] | None = None
401399
"""Axis of the parent to align to."""
400+
name: str | None = None
401+
"""Name of the attribute in the parent object."""
402402

403403
def construct(self, obj: AnnData, *, store: MutableMapping[str, Value]) -> T:
404404
if self.axis is None:
@@ -414,6 +414,9 @@ def fake(): ...
414414
fake.__annotations__ = {"return": self.cls._actual_class | self.cls._view_class}
415415
return fake
416416

417+
def __set_name__(self, owner: AnnData, name: str):
418+
self.name = name
419+
417420
def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T:
418421
if obj is None:
419422
# When accessed from the class, e.g. via `AnnData.obs`,

src/anndata/_core/anndata.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
from scipy import sparse
6666
from zarr.storage import StoreLike
6767

68+
from anndata._types import AnnDataElem
6869
from anndata.typing import RWAble
6970

7071
from .._types import ReduceFunc
@@ -283,12 +284,6 @@ def _init_as_view(
283284
oidx: _Index1DNorm | int | np.integer,
284285
vidx: _Index1DNorm | int | np.integer,
285286
):
286-
if adata_ref.isbacked and adata_ref.is_view:
287-
msg = (
288-
"Currently, you cannot index repeatedly into a backed AnnData, "
289-
"that is, you cannot make a view of a view."
290-
)
291-
raise ValueError(msg)
292287
self._is_view = True
293288
if isinstance(oidx, int | np.integer):
294289
if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs):
@@ -705,9 +700,7 @@ def X(self, value: _XDataType | None):
705700
def X(self):
706701
self.X = None
707702

708-
layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty(
709-
"layers", Layers
710-
)
703+
layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty(Layers)
711704
"""\
712705
Dictionary-like object with values of the same dimensions as :attr:`X`.
713706
@@ -923,7 +916,7 @@ def uns(self):
923916
self.uns = OrderedDict()
924917

925918
obsm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
926-
"obsm", AxisArrays, 0
919+
AxisArrays, 0
927920
)
928921
"""\
929922
Multi-dimensional annotation of observations
@@ -935,7 +928,7 @@ def uns(self):
935928
"""
936929

937930
varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty(
938-
"varm", AxisArrays, 1
931+
AxisArrays, 1
939932
)
940933
"""\
941934
Multi-dimensional annotation of variables/features
@@ -947,7 +940,7 @@ def uns(self):
947940
"""
948941

949942
obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = (
950-
AlignedMappingProperty("obsp", PairwiseArrays, 0)
943+
AlignedMappingProperty(PairwiseArrays, 0)
951944
)
952945
"""\
953946
Pairwise annotation of observations,
@@ -959,7 +952,7 @@ def uns(self):
959952
"""
960953

961954
varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = (
962-
AlignedMappingProperty("varp", PairwiseArrays, 1)
955+
AlignedMappingProperty(PairwiseArrays, 1)
963956
)
964957
"""\
965958
Pairwise annotation of variables/features,
@@ -1289,6 +1282,12 @@ def transpose(self) -> AnnData:
12891282
"which is currently not implemented. Call `.copy()` before transposing."
12901283
)
12911284
raise ValueError(msg)
1285+
if any(
1286+
isinstance(elem, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset)
1287+
for elem in (self.X, *self.layers.values())
1288+
):
1289+
msg = "Cannot transpose anndata object that has raw zarr arrays or h5py arrays backing X or layers"
1290+
raise ValueError(msg)
12921291

12931292
return AnnData(
12941293
X=_safe_transpose(X) if X is not None else None,
@@ -1464,9 +1463,32 @@ def to_memory(self, *, copy: bool = False) -> AnnData:
14641463

14651464
return AnnData(**new)
14661465

1466+
def _has_raw_zarr_or_h5_array(self) -> bool:
1467+
def predicate(
1468+
elem: RWAble,
1469+
*,
1470+
accumulate: bool,
1471+
attr_name: AnnDataElem | None = None,
1472+
):
1473+
if isinstance(elem, MutableMapping):
1474+
return accumulate or any(
1475+
isinstance(
1476+
v, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset
1477+
)
1478+
for v in elem.values()
1479+
)
1480+
return accumulate or isinstance(
1481+
elem, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset
1482+
)
1483+
1484+
return self._reduce(predicate, init=False)
1485+
14671486
def copy(self, filename: PathLike[str] | str | None = None) -> AnnData:
14681487
"""Full copy, optionally on disk."""
14691488
if not self.isbacked:
1489+
if self._has_raw_zarr_or_h5_array():
1490+
msg = "Copy is not implemented for anndatas which have backing raw h5 (not in backed mode) or zarr arrays"
1491+
raise NotImplementedError(msg)
14701492
if self.is_view and self._has_X():
14711493
# TODO: How do I unambiguously check if this is a copy?
14721494
# Subsetting this way means we don’t have to have a view type
@@ -1542,7 +1564,7 @@ def predicate( # noqa: PLR0911
15421564
elem: RWAble,
15431565
*,
15441566
accumulate: bool,
1545-
attr_name: str | None = None, # TODO: type
1567+
attr_name: AnnDataElem | None = None,
15461568
):
15471569
if elem is None:
15481570
return accumulate

0 commit comments

Comments
 (0)