Skip to content

Commit e8d9742

Browse files
authored
Merge branch 'main' into pre-commit-ci-update-config
2 parents ba97a10 + a6123c0 commit e8d9742

13 files changed

Lines changed: 145 additions & 119 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ __pycache__/
1212
# Distribution / packaging
1313
/dist/
1414
/ci/min-deps.txt
15+
/ci/pre-deps.txt
1516
/requirements*.lock
1617
/.python-version
1718

docs/release-notes/2008.bugfix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Raise the lower bound of `xarray` to `2025.06.1`. {class}`pandas.arrays.StringArray` was previously used as the in-memory `nullable-string-array` container in `xarray`, but due to {issue}`pydata/xarray#10419` the in-memory container is now a {class}`numpy.ndarray` with an object data type. {user}`ilan-gold`

hatch.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,14 @@ extra-dependencies = [ "ipykernel" ]
2020
env-vars.UV_CONSTRAINT = "ci/constraints.txt"
2121
overrides.matrix.deps.env-vars = [
2222
{ if = [ "pre" ], key = "UV_PRERELEASE", value = "allow" },
23+
{ if = [ "pre" ], key = "UV_CONSTRAINT", value = "ci/pre-deps.txt" },
2324
{ if = [ "min" ], key = "UV_CONSTRAINT", value = "ci/constraints.txt ci/min-deps.txt" },
2425
]
2526
overrides.matrix.deps.pre-install-commands = [
2627
{ if = [ "min" ], value = "uv run ci/scripts/min-deps.py pyproject.toml --all-extras -o ci/min-deps.txt" },
28+
# Install xarray from its git repository in the "pre" environment to catch situations like https://github.com/pydata/xarray/issues/10419 early going forward
29+
{ if = [ "pre" ], value = "echo xarray @ git+https://github.com/pydata/xarray.git > ci/pre-deps.txt" },
30+
2731
]
2832
overrides.matrix.deps.python = [
2933
{ if = [ "min" ], value = "3.11" },

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ Home-page = "https://github.com/scverse/anndata"
6060
dev = [
6161
# runtime dev version generation
6262
"hatch-vcs",
63-
"anndata[dev-doc,test]",
63+
"anndata[dev-doc]",
6464
]
6565
doc = [
6666
"sphinx>=8.2.1",
@@ -109,7 +109,7 @@ gpu = [ "cupy" ]
109109
cu12 = [ "cupy-cuda12x" ]
110110
cu11 = [ "cupy-cuda11x" ]
111111
# requests and aiohttp needed for zarr remote data
112-
lazy = [ "xarray>=2025.04.0", "aiohttp", "requests", "anndata[dask]" ]
112+
lazy = [ "xarray>=2025.06.1", "aiohttp", "requests", "anndata[dask]" ]
113113
# https://github.com/dask/dask/issues/11290
114114
# https://github.com/dask/dask/issues/11752
115115
dask = [ "dask[array]>=2023.5.1,!=2024.8.*,!=2024.9.*,<2025.2.0" ]

src/anndata/_core/xarray.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,26 @@
11
from __future__ import annotations
22

33
import warnings
4+
from functools import wraps
45

56
import pandas as pd
67

78
from ..compat import XDataArray, XDataset, XVariable
89

910

11+
def requires_xarray(func):
12+
@wraps(func)
13+
def wrapper(*args, **kwargs):
14+
try:
15+
import xarray # noqa: F401
16+
except ImportError as e:
17+
msg = "xarray is required to read dataframes lazily. Please install xarray."
18+
raise ImportError(msg) from e
19+
return func(*args, **kwargs)
20+
21+
return wrapper
22+
23+
1024
class Dataset2D(XDataset):
1125
"""
1226
A wrapper class meant to enable working with lazy dataframe data.
@@ -119,10 +133,18 @@ def __getitem__(self, idx) -> Dataset2D:
119133
return ret
120134

121135
def to_memory(self, *, copy=False) -> pd.DataFrame:
136+
# https://github.com/pydata/xarray/issues/10419
137+
non_nullable_string_cols = {
138+
col
139+
for col in self.columns
140+
if not self[col].attrs.get("is_nullable_string", False)
141+
}
122142
df = self.to_dataframe()
123143
index_key = self.attrs.get("indexing_key", None)
124144
if df.index.name != index_key and index_key is not None:
125145
df = df.set_index(index_key)
146+
for col in set(self.columns) - non_nullable_string_cols:
147+
df[col] = pd.array(self[col].data, dtype="string")
126148
df.index.name = None # matches old AnnData object
127149
return df
128150

src/anndata/_io/specs/lazy_methods.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
import anndata as ad
1414
from anndata._core.file_backing import filename, get_elem_name
15-
from anndata._core.xarray import Dataset2D
15+
from anndata._core.xarray import Dataset2D, requires_xarray
1616
from anndata.abc import CSCDataset, CSRDataset
1717
from anndata.compat import DaskArray, H5Array, H5Group, XDataArray, ZarrArray, ZarrGroup
1818

@@ -241,6 +241,9 @@ def _gen_xarray_dict_iterator_from_elems(
241241
attrs={
242242
"base_path_or_zarr_group": v.base_path_or_zarr_group,
243243
"elem_name": v.elem_name,
244+
"is_nullable_string": isinstance(v, MaskedArray)
245+
and v.dtype # CategoricalArray dtype access requires a read and is not necessary here
246+
== np.dtype("O"),
244247
},
245248
)
246249
elif k == dim_name:
@@ -258,6 +261,7 @@ def _gen_xarray_dict_iterator_from_elems(
258261

259262
@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0"))
260263
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0"))
264+
@requires_xarray
261265
def read_dataframe(
262266
elem: H5Group | ZarrGroup,
263267
*,
@@ -297,6 +301,7 @@ def read_dataframe(
297301

298302
@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0"))
299303
@_LAZY_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0"))
304+
@requires_xarray
300305
def read_categorical(
301306
elem: H5Group | ZarrGroup,
302307
*,
@@ -317,6 +322,7 @@ def read_categorical(
317322
)
318323

319324

325+
@requires_xarray
320326
def read_nullable(
321327
elem: H5Group | ZarrGroup,
322328
*,

src/anndata/experimental/backed/_io.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from testing.anndata._doctest import doctest_needs
1414

1515
from ..._core.anndata import AnnData
16+
from ..._core.xarray import requires_xarray
1617
from ..._settings import settings
1718
from ...compat import ZarrGroup, is_zarr_v2
1819
from .. import read_dispatched
@@ -25,6 +26,7 @@
2526

2627

2728
@doctest_needs("xarray")
29+
@requires_xarray
2830
def read_lazy(
2931
store: PathLike[str] | str | MutableMapping | ZarrGroup | h5py.Dataset,
3032
*,
@@ -81,13 +83,6 @@ def read_lazy(
8183
AnnData object with n_obs × n_vars = 490 × 33452
8284
obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id'...
8385
"""
84-
try:
85-
import xarray # noqa: F401
86-
except ImportError as e:
87-
msg = (
88-
"xarray is required to use the `read_lazy` function. Please install xarray."
89-
)
90-
raise ImportError(msg) from e
9186
is_h5_store = isinstance(store, h5py.Dataset | h5py.File | h5py.Group)
9287
is_h5 = (
9388
isinstance(store, PathLike | str) and Path(store).suffix == ".h5ad"

src/anndata/experimental/backed/_lazy_arrays.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from functools import cached_property
44
from typing import TYPE_CHECKING, Generic, TypeVar
55

6+
import numpy as np
67
import pandas as pd
78

89
from anndata._core.index import _subset
@@ -18,8 +19,6 @@
1819
from pathlib import Path
1920
from typing import Literal
2021

21-
import numpy as np
22-
2322
from anndata._core.index import Index
2423
from anndata.compat import ZarrGroup
2524

@@ -137,7 +136,7 @@ def __init__(
137136

138137
def __getitem__(
139138
self, key: xr.core.indexing.ExplicitIndexer
140-
) -> xr.core.extension_array.PandasExtensionArray:
139+
) -> xr.core.extension_array.PandasExtensionArray | np.ndarray:
141140
values = self._values[key]
142141
mask = self._mask[key]
143142
if self._dtype_str == "nullable-integer":
@@ -146,8 +145,9 @@ def __getitem__(
146145
elif self._dtype_str == "nullable-boolean":
147146
extension_array = pd.arrays.BooleanArray(values, mask=mask)
148147
elif self._dtype_str == "nullable-string-array":
149-
values[mask] = pd.NA
150-
extension_array = pd.array(values, dtype=pd.StringDtype())
148+
# https://github.com/pydata/xarray/issues/10419
149+
values[mask] = np.nan
150+
return values
151151
else:
152152
msg = f"Invalid dtype_str {self._dtype_str}"
153153
raise RuntimeError(msg)
@@ -163,7 +163,8 @@ def dtype(self):
163163
elif self._dtype_str == "nullable-boolean":
164164
return pd.BooleanDtype()
165165
elif self._dtype_str == "nullable-string-array":
166-
return pd.StringDtype()
166+
# https://github.com/pydata/xarray/issues/10419
167+
return np.dtype("O")
167168
msg = f"Invalid dtype_str {self._dtype_str}"
168169
raise RuntimeError(msg)
169170

src/anndata/tests/helpers.py

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from collections import Counter, defaultdict
77
from collections.abc import Mapping
88
from functools import partial, singledispatch, wraps
9+
from importlib.util import find_spec
910
from string import ascii_letters
1011
from typing import TYPE_CHECKING
1112

@@ -311,7 +312,6 @@ def gen_adata( # noqa: PLR0913
311312
(csr, csc)
312313
"""
313314
import dask.array as da
314-
import xarray as xr
315315

316316
if random_state is None:
317317
random_state = np.random.default_rng()
@@ -325,10 +325,11 @@ def gen_adata( # noqa: PLR0913
325325
obs.rename(columns=dict(cat="obs_cat"), inplace=True)
326326
var.rename(columns=dict(cat="var_cat"), inplace=True)
327327

328-
if obs_xdataset:
329-
obs = XDataset.from_dataframe(obs)
330-
if var_xdataset:
331-
var = XDataset.from_dataframe(var)
328+
if has_xr := find_spec("xarray"):
329+
if obs_xdataset:
330+
obs = XDataset.from_dataframe(obs)
331+
if var_xdataset:
332+
var = XDataset.from_dataframe(var)
332333

333334
if X_type is None:
334335
X = None
@@ -341,27 +342,28 @@ def gen_adata( # noqa: PLR0913
341342
df=gen_typed_df(M, obs_names, dtypes=obs_dtypes),
342343
awk_2d_ragged=gen_awkward((M, None)),
343344
da=da.random.random((M, 50)),
344-
xdataset=xr.Dataset.from_dataframe(
345-
gen_typed_df(M, obs_names, dtypes=obs_dtypes)
346-
),
347-
)
348-
obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types}
349-
obsm = maybe_add_sparse_array(
350-
mapping=obsm,
351-
types=obsm_types,
352-
format=sparse_fmt,
353-
random_state=random_state,
354-
shape=(M, 100),
355345
)
356346
varm = dict(
357347
array=np.random.random((N, 50)),
358348
sparse=sparse.random(N, 100, format=sparse_fmt, random_state=random_state),
359349
df=gen_typed_df(N, var_names, dtypes=var_dtypes),
360350
awk_2d_ragged=gen_awkward((N, None)),
361351
da=da.random.random((N, 50)),
362-
xdataset=xr.Dataset.from_dataframe(
352+
)
353+
if has_xr:
354+
obsm["xdataset"] = XDataset.from_dataframe(
355+
gen_typed_df(M, obs_names, dtypes=obs_dtypes)
356+
)
357+
varm["xdataset"] = XDataset.from_dataframe(
363358
gen_typed_df(N, var_names, dtypes=var_dtypes)
364-
),
359+
)
360+
obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types}
361+
obsm = maybe_add_sparse_array(
362+
mapping=obsm,
363+
types=obsm_types,
364+
format=sparse_fmt,
365+
random_state=random_state,
366+
shape=(M, 100),
365367
)
366368
varm = {k: v for k, v in varm.items() if type(v) in varm_types}
367369
varm = maybe_add_sparse_array(

tests/lazy/test_read.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
from anndata._types import AnnDataElem
2727

28+
2829
pytestmark = pytest.mark.skipif(not find_spec("xarray"), reason="xarray not installed")
2930

3031

0 commit comments

Comments
 (0)