diff --git a/docs/release-notes/2062.bugfix.md b/docs/release-notes/2062.bugfix.md
new file mode 100644
index 000000000..d72a295b5
--- /dev/null
+++ b/docs/release-notes/2062.bugfix.md
@@ -0,0 +1 @@
+Allow reading of non-string index in {func}`anndata.experimental.read_lazy` and {func}`anndata.experimental.read_elem_lazy` for {class}`anndata.experimental.backed.Dataset2D` {user}`ilan-gold`
diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index b6ad2e591..bc51ee004 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import warnings
 from contextlib import contextmanager
 from functools import partial, singledispatch
 from pathlib import Path
@@ -304,7 +305,17 @@ def read_dataframe(
     if not use_range_index:
         dim_name = elem.attrs["_index"]
         # no sense in reading this in multiple times
-        index = elem_dict[dim_name].compute()
+        if isinstance(elem_dict[dim_name], DaskArray):
+            index = elem_dict[dim_name].compute()
+        else:
+            import xarray
+
+            index = elem_dict[dim_name][
+                xarray.core.indexing.BasicIndexer((slice(None),))
+            ].array
+            msg = f"Found non-string indices {index} in on-disk {elem} store"
+            warnings.warn(msg, stacklevel=2)
+            index = pd.Index(index)
     else:
         dim_name = DUMMY_RANGE_INDEX_KEY
         index = pd.RangeIndex(len(elem_dict[elem.attrs["_index"]])).astype("str")
diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py
index 9624668d6..81d8c9780 100644
--- a/tests/lazy/test_read.py
+++ b/tests/lazy/test_read.py
@@ -211,3 +211,15 @@ def test_chunks_df(
     for k in ds:
         if isinstance(arr := ds[k].data, DaskArray):
             assert arr.chunksize == expected_chunks
+
+
+@pytest.mark.zarr_io
+def test_categorical_idx(tmp_path: Path):
+    adata = AnnData(shape=[10, 20])
+    adata.obs.index = adata.obs.index.astype("category")
+    orig_pth = tmp_path / "categorical_id.zarr"
+    adata.write_zarr(orig_pth)
+    with pytest.warns(UserWarning, match=r"non-string indices"):
+        remote = read_lazy(orig_pth)
+    remote_to_memory = remote.to_memory()
+    assert_equal(adata, remote_to_memory)