From 944393790bb470bc3b7ffdf9c6a0dbf28f10589d Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 17 Mar 2025 10:08:33 +0100 Subject: [PATCH 1/4] Pass through convert_strings_to_categoricals --- src/anndata/_core/anndata.py | 42 ++++++++++++++++++++------ src/anndata/_core/file_backing.py | 6 ++-- src/anndata/_io/h5ad.py | 9 ++++-- src/anndata/_io/read.py | 16 +++++----- src/anndata/_io/specs/methods.py | 2 +- src/anndata/_io/write.py | 10 ++++-- src/anndata/_io/zarr.py | 3 +- src/anndata/experimental/backed/_io.py | 5 +-- src/anndata/experimental/merge.py | 10 +++--- tests/lazy/test_concat.py | 3 +- tests/test_readwrite.py | 3 +- 11 files changed, 72 insertions(+), 37 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 576321e41..02323a627 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -53,6 +53,8 @@ from os import PathLike from typing import Any, Literal + from zarr.storage import StoreLike + from ..compat import Index1D from ..typing import XDataType from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView @@ -216,7 +218,7 @@ def __init__( raw: Mapping[str, Any] | None = None, dtype: np.dtype | type | str | None = None, shape: tuple[int, int] | None = None, - filename: PathLike | None = None, + filename: PathLike[str] | str | None = None, filemode: Literal["r", "r+"] | None = None, asview: bool = False, obsp: np.ndarray | Mapping[str, Sequence[Any]] | None = None, @@ -960,7 +962,7 @@ def filename(self) -> Path | None: return self.file.filename @filename.setter - def filename(self, filename: PathLike | None): + def filename(self, filename: PathLike[str] | str | None): # convert early for later comparison filename = None if filename is None else Path(filename) # change from backing-mode back to full loading into memory @@ -1439,7 +1441,7 @@ def to_memory(self, *, copy: bool = False) -> AnnData: return AnnData(**new) - def copy(self, filename: PathLike | None = None) -> AnnData: + def copy(self, filename: PathLike[str] | str | None = None) -> AnnData: """Full copy, optionally on disk.""" if not self.isbacked: if self.is_view and self._has_X(): @@ -1800,9 +1802,12 @@ def _check_dimensions(self, key=None): ) raise ValueError(msg) + @old_positionals("compression", "compression_opts", "as_dense") def write_h5ad( self, - filename: PathLike | None = None, + filename: PathLike[str] | str | None = None, + *, + convert_strings_to_categoricals: bool = True, compression: Literal["gzip", "lzf"] | None = None, compression_opts: int | Any = None, as_dense: Sequence[str] = (), @@ -1826,6 +1831,8 @@ def write_h5ad( ---------- filename Filename of data file. Defaults to backing file. + convert_strings_to_categoricals + Convert string columns to categorical. compression For [`lzf`, `gzip`], see the h5py :ref:`dataset_compression`. @@ -1880,6 +1887,7 @@ def write_h5ad( write_h5ad( Path(filename), self, + convert_strings_to_categoricals=convert_strings_to_categoricals, compression=compression, compression_opts=compression_opts, as_dense=as_dense, @@ -1891,7 +1899,9 @@ def write_h5ad( write = write_h5ad # a shortcut and backwards compat @old_positionals("skip_data", "sep") - def write_csvs(self, dirname: PathLike, *, skip_data: bool = True, sep: str = ","): + def write_csvs( + self, dirname: PathLike[str] | str, *, skip_data: bool = True, sep: str = "," + ): """\ Write annotation to `.csv` files. @@ -1912,7 +1922,9 @@ def write_csvs(self, dirname: PathLike, *, skip_data: bool = True, sep: str = ", write_csvs(dirname, self, skip_data=skip_data, sep=sep) @old_positionals("write_obsm_varm") - def write_loom(self, filename: PathLike, *, write_obsm_varm: bool = False): + def write_loom( + self, filename: PathLike[str] | str, *, write_obsm_varm: bool = False + ): """\ Write `.loom`-formatted hdf5 file. @@ -1925,10 +1937,13 @@ def write_loom(self, filename: PathLike, *, write_obsm_varm: bool = False): write_loom(filename, self, write_obsm_varm=write_obsm_varm) + @old_positionals("chunks") def write_zarr( self, - store: MutableMapping | PathLike, + store: StoreLike, + *, chunks: tuple[int, ...] | None = None, + convert_strings_to_categoricals: bool = True, ): """\ Write a hierarchical Zarr array store. @@ -1939,6 +1954,8 @@ def write_zarr( The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class. chunks Chunk shape. + convert_strings_to_categoricals + Convert string columns to categorical. """ from ..io import write_zarr @@ -1949,7 +1966,12 @@ def write_zarr( "Please pass `write_zarr(adata)` instead." ) raise ValueError(msg) - write_zarr(store, self, chunks=chunks) + write_zarr( + store, + self, + chunks=chunks, + convert_strings_to_categoricals=convert_strings_to_categoricals, + ) def chunked_X(self, chunk_size: int | None = None): """\ @@ -2090,10 +2112,10 @@ def _infer_shape_for_axis( return elem.shape[0] for elem, id in zip([layers, xxxm, xxxp], ["layers", "xxxm", "xxxp"]): if elem is not None: - elem = cast(Mapping, elem) + elem = cast("Mapping", elem) for sub_elem in elem.values(): if hasattr(sub_elem, "shape"): - size = cast(int, sub_elem.shape[axis if id == "layers" else 0]) + size = cast("int", sub_elem.shape[axis if id == "layers" else 0]) return size return None diff --git a/src/anndata/_core/file_backing.py b/src/anndata/_core/file_backing.py index 005a47b97..45275e651 100644 --- a/src/anndata/_core/file_backing.py +++ b/src/anndata/_core/file_backing.py @@ -26,7 +26,7 @@ class AnnDataFileManager: def __init__( self, adata: anndata.AnnData, - filename: PathLike | None = None, + filename: PathLike[str] | str | None = None, filemode: Literal["r", "r+"] | None = None, ): self._adata_ref = weakref.ref(adata) @@ -81,12 +81,12 @@ def filename(self) -> Path: return self._filename @filename.setter - def filename(self, filename: PathLike | None): + def filename(self, filename: PathLike[str] | str | None): self._filename = None if filename is None else Path(filename) def open( self, - filename: PathLike | None = None, + filename: PathLike[str] | str | None = None, filemode: Literal["r", "r+"] | None = None, ): if filename is not None: diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 43a390ac0..9a038f7ac 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -36,6 +36,7 @@ if TYPE_CHECKING: from collections.abc import Callable, Collection, Mapping, Sequence + from os import PathLike from typing import Any, Literal from .._core.file_backing import AnnDataFileManager @@ -44,7 +45,7 @@ def write_h5ad( - filepath: Path | str, + filepath: PathLike[str] | str, adata: AnnData, *, as_dense: Sequence[str] = (), @@ -140,7 +141,9 @@ def write_sparse_as_dense( del f[key] -def read_h5ad_backed(filename: str | Path, mode: Literal["r", "r+"]) -> AnnData: +def read_h5ad_backed( + filename: str | PathLike[str], mode: Literal["r", "r+"] +) -> AnnData: d = dict(filename=filename, filemode=mode) f = h5py.File(filename, mode) @@ -169,7 +172,7 @@ def read_h5ad_backed(filename: str | Path, mode: Literal["r", "r+"]) -> AnnData: def read_h5ad( - filename: str | Path, + filename: PathLike[str] | str, backed: Literal["r", "r+"] | bool | None = None, *, as_sparse: Sequence[str] = (), diff --git a/src/anndata/_io/read.py b/src/anndata/_io/read.py index a472b6879..c652e2f8c 100644 --- a/src/anndata/_io/read.py +++ b/src/anndata/_io/read.py @@ -23,7 +23,7 @@ def read_csv( - filename: PathLike | Iterator[str], + filename: PathLike[str] | str | Iterator[str], delimiter: str | None = ",", first_column_names: bool | None = None, dtype: str = "float32", @@ -49,7 +49,9 @@ def read_csv( return read_text(filename, delimiter, first_column_names, dtype) -def read_excel(filename: PathLike, sheet: str | int, dtype: str = "float32") -> AnnData: +def read_excel( + filename: PathLike[str] | str, sheet: str | int, dtype: str = "float32" +) -> AnnData: """\ Read `.xlsx` (Excel) file. @@ -73,7 +75,7 @@ def read_excel(filename: PathLike, sheet: str | int, dtype: str = "float32") -> return AnnData(X, row, col) -def read_umi_tools(filename: PathLike, dtype=None) -> AnnData: +def read_umi_tools(filename: PathLike[str] | str, dtype=None) -> AnnData: """\ Read a gzipped condensed count matrix from umi_tools. @@ -96,7 +98,7 @@ def read_umi_tools(filename: PathLike, dtype=None) -> AnnData: return AnnData(X=X, obs=obs, var=var) -def read_hdf(filename: PathLike, key: str) -> AnnData: +def read_hdf(filename: PathLike[str] | str, key: str) -> AnnData: """\ Read `.h5` (hdf5) file. @@ -152,7 +154,7 @@ def _fmt_loom_axis_attrs( @_deprecate_positional_args(version="0.9") def read_loom( - filename: PathLike, + filename: PathLike[str] | str, *, sparse: bool = True, cleanup: bool = False, @@ -295,7 +297,7 @@ def read_loom( return adata -def read_mtx(filename: PathLike, dtype: str = "float32") -> AnnData: +def read_mtx(filename: PathLike[str] | str, dtype: str = "float32") -> AnnData: """\ Read `.mtx` file. @@ -317,7 +319,7 @@ def read_mtx(filename: PathLike, dtype: str = "float32") -> AnnData: def read_text( - filename: PathLike | Iterator[str], + filename: PathLike[str] | str | Iterator[str], delimiter: str | None = None, first_column_names: bool | None = None, dtype: str = "float32", diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 71ded5531..1e3a58830 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -193,7 +193,7 @@ def read_indices(group): def read_partial( - pth: PathLike, + pth: PathLike[str] | str, *, obs_idx=slice(None), var_idx=slice(None), diff --git a/src/anndata/_io/write.py b/src/anndata/_io/write.py index 467249e03..89fccd579 100644 --- a/src/anndata/_io/write.py +++ b/src/anndata/_io/write.py @@ -24,7 +24,11 @@ @old_positionals("skip_data", "sep") def write_csvs( - dirname: PathLike, adata: AnnData, *, skip_data: bool = True, sep: str = "," + dirname: PathLike[str] | str, + adata: AnnData, + *, + skip_data: bool = True, + sep: str = ",", ): """See :meth:`~anndata.AnnData.write_csvs`.""" dirname = Path(dirname) @@ -78,7 +82,9 @@ def write_csvs( @old_positionals("write_obsm_varm") -def write_loom(filename: PathLike, adata: AnnData, *, write_obsm_varm: bool = False): +def write_loom( + filename: PathLike[str] | str, adata: AnnData, *, write_obsm_varm: bool = False +): filename = Path(filename) row_attrs = {k: np.array(v) for k, v in adata.var.to_dict("list").items()} row_names = adata.var_names diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 4718a8940..0432821d8 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: from collections.abc import MutableMapping + from os import PathLike from zarr.core.common import AccessModeLiteral from zarr.storage import StoreLike @@ -61,7 +62,7 @@ def callback(func, s, k: str, elem, dataset_kwargs, iospec): zarr.consolidate_metadata(f.store) -def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData: +def read_zarr(store: PathLike[str] | str | MutableMapping | zarr.Group) -> AnnData: """\ Read from a hierarchical Zarr array store. diff --git a/src/anndata/experimental/backed/_io.py b/src/anndata/experimental/backed/_io.py index e64919dba..9e7bab131 100644 --- a/src/anndata/experimental/backed/_io.py +++ b/src/anndata/experimental/backed/_io.py @@ -2,6 +2,7 @@ import typing import warnings +from os import PathLike from pathlib import Path from typing import TYPE_CHECKING @@ -25,7 +26,7 @@ @doctest_needs("xarray") def read_lazy( - store: str | Path | MutableMapping | ZarrGroup | h5py.Dataset, + store: PathLike[str] | str | MutableMapping | ZarrGroup | h5py.Dataset, *, load_annotation_index: bool = True, ) -> AnnData: @@ -89,7 +90,7 @@ def read_lazy( raise ImportError(msg) is_h5_store = isinstance(store, h5py.Dataset | h5py.File | h5py.Group) is_h5 = ( - isinstance(store, Path | str) and Path(store).suffix == ".h5ad" + isinstance(store, PathLike | str) and Path(store).suffix == ".h5ad" ) or is_h5_store has_keys = True # true if consolidated or h5ad diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py index 1cb447f23..2a1644c7a 100644 --- a/src/anndata/experimental/merge.py +++ b/src/anndata/experimental/merge.py @@ -1,9 +1,9 @@ from __future__ import annotations -import os import shutil from collections.abc import Mapping from functools import singledispatch +from os import PathLike from pathlib import Path from typing import TYPE_CHECKING @@ -105,9 +105,9 @@ def as_group(store, *, mode: str) -> ZarrGroup | H5Group: raise NotImplementedError(msg) -@as_group.register(os.PathLike) +@as_group.register(PathLike) @as_group.register(str) -def _(store: os.PathLike | str, *, mode: str) -> ZarrGroup | H5Group: +def _(store: PathLike[str] | str, *, mode: str) -> ZarrGroup | H5Group: store = Path(store) if store.suffix == ".h5ad": import h5py @@ -410,8 +410,8 @@ def _write_axis_annot( def concat_on_disk( - in_files: Collection[str | os.PathLike] | Mapping[str, str | os.PathLike], - out_file: str | os.PathLike, + in_files: Collection[PathLike[str] | str] | Mapping[str, PathLike[str] | str], + out_file: PathLike[str] | str, *, max_loaded_elems: int = 100_000_000, axis: Literal["obs", 0, "var", 1] = 0, diff --git a/tests/lazy/test_concat.py b/tests/lazy/test_concat.py index d732c316b..d2c5efdb4 100644 --- a/tests/lazy/test_concat.py +++ b/tests/lazy/test_concat.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: from collections.abc import Callable + from pathlib import Path from typing import Literal from numpy.typing import NDArray @@ -314,7 +315,7 @@ def with_elem_in_memory(adata: AnnData, attr: str, key: str | None) -> AnnData: assert_equal(mixed_concatenated, in_memory_concatenated) -def test_concat_bad_mixed_types(tmp_path: str): +def test_concat_bad_mixed_types(tmp_path: Path): orig = gen_adata((100, 200), np.array) orig.write_zarr(tmp_path) remote = read_lazy(tmp_path) diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 8d141689f..1ee8186fd 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -32,7 +32,6 @@ from anndata.tests.helpers import as_dense_dask_array, assert_equal, gen_adata if TYPE_CHECKING: - from os import PathLike from typing import Literal HERE = Path(__file__).parent @@ -541,7 +540,7 @@ def test_write_csv_view(typ, tmp_path): # https://github.com/scverse/anndata/issues/401 import hashlib - def md5_path(pth: PathLike) -> bytes: + def md5_path(pth: Path) -> bytes: checksum = hashlib.md5() with pth.open("rb") as f: while True: From 86e029dfb327e3dabf5923d638fe1feced596459 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 17 Mar 2025 10:38:00 +0100 Subject: [PATCH 2/4] add exports --- docs/api.md | 17 ++++++++++++++++- docs/conf.py | 3 +++ src/anndata/_io/h5ad.py | 1 + src/anndata/_io/write.py | 3 ++- src/anndata/_io/zarr.py | 1 + 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/api.md b/docs/api.md index 951786f81..11d15546e 100644 --- a/docs/api.md +++ b/docs/api.md @@ -69,13 +69,28 @@ You might have more success by assembling the {class}`AnnData` object yourself f ## Writing Writing a complete {class}`AnnData` object to disk in anndata’s native formats `.h5ad` and `zarr`. +(These functions are also exported as {func}`io.write_h5ad` and {func}`io.write_zarr`.) ```{eval-rst} .. autosummary:: :toctree: generated/ - AnnData.write + AnnData.write_h5ad AnnData.write_zarr + + +.. + .. autosummary:: + :toctree: generated/ + + io.write_h5ad + io.write_zarr + +.. toctree:: + :hidden: + + generated/anndata.io.write_h5ad + generated/anndata.io.write_zarr ``` Writing individual portions ({attr}`~AnnData.obs`, {attr}`~AnnData.varm` etc.) of the {class}`AnnData` object. diff --git a/docs/conf.py b/docs/conf.py index 58a21cc3b..6f964b6d1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -132,6 +132,9 @@ def setup(app: Sphinx): "anndata._types.Write": "anndata.experimental.Write", "zarr.core.array.Array": "zarr.Array", "zarr.core.group.Group": "zarr.Group", + # Buffer is not yet exported, so the buffer class registry is the closest thing + "zarr.core.buffer.core.Buffer": "zarr.registry.Registry", + "zarr.storage._common.StorePath": "zarr.storage.StorePath", "anndata.compat.DaskArray": "dask.array.Array", "anndata.compat.CupyArray": "cupy.ndarray", "anndata.compat.CupySparseMatrix": "cupyx.scipy.sparse.spmatrix", diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 9a038f7ac..6a976e2f5 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -53,6 +53,7 @@ def write_h5ad( dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), **kwargs, ) -> None: + """See :meth:`~anndata.AnnData.write_h5ad`.""" if isinstance(as_dense, str): as_dense = [as_dense] if "raw.X" in as_dense: diff --git a/src/anndata/_io/write.py b/src/anndata/_io/write.py index 89fccd579..5e4c26726 100644 --- a/src/anndata/_io/write.py +++ b/src/anndata/_io/write.py @@ -84,7 +84,8 @@ def write_csvs( @old_positionals("write_obsm_varm") def write_loom( filename: PathLike[str] | str, adata: AnnData, *, write_obsm_varm: bool = False -): +) -> None: + """See :meth:`~anndata.AnnData.write_loom`.""" filename = Path(filename) row_attrs = {k: np.array(v) for k, v in adata.var.to_dict("list").items()} row_names = adata.var_names diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 0432821d8..20d257b8d 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -35,6 +35,7 @@ def write_zarr( convert_strings_to_categoricals: bool = True, **ds_kwargs, ) -> None: + """See :meth:`~anndata.AnnData.write_zarr`.""" if isinstance(store, Path): store = str(store) if convert_strings_to_categoricals: From 233e956d7f9367f40183a9ae2da9a0e091b6b836 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 17 Mar 2025 11:27:32 +0100 Subject: [PATCH 3/4] relnote --- docs/release-notes/1914.bugfix.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/release-notes/1914.bugfix.md diff --git a/docs/release-notes/1914.bugfix.md b/docs/release-notes/1914.bugfix.md new file mode 100644 index 000000000..b22d80857 --- /dev/null +++ b/docs/release-notes/1914.bugfix.md @@ -0,0 +1 @@ +Add `convert_strings_to_categoricals` parameter also to {meth}`~anndata.AnnData.write_h5ad` and {meth}`~anndata.AnnData.write_zarr` as intended {user}`flying-sheep` From a26022be7d96500add118d1ec49d2d6027287956 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 17 Mar 2025 13:15:00 +0100 Subject: [PATCH 4/4] add test for convert_strings_to_categoricals --- tests/test_readwrite.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 1ee8186fd..d459d4ed1 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -607,20 +607,24 @@ def test_read_umi_tools(): assert set(adata.obs_names) == {"ACAAGG", "TTCACG"} -def test_write_categorical(tmp_path, diskfmt): +@pytest.mark.parametrize("s2c", [True, False], ids=["str2cat", "preserve"]) +def test_write_categorical( + *, tmp_path: Path, diskfmt: Literal["h5ad", "zarr"], s2c: bool +) -> None: + ad.settings.allow_write_nullable_strings = True adata_pth = tmp_path / f"adata.{diskfmt}" - orig = ad.AnnData( - obs=pd.DataFrame( - dict( - cat1=["a", "a", "b", np.nan, np.nan], - cat2=pd.Categorical(["a", "a", "b", np.nan, np.nan]), - ) - ), + obs = dict( + str=pd.array(["a", "a", "b", pd.NA, pd.NA], dtype="string"), + cat=pd.Categorical(["a", "a", "b", np.nan, np.nan]), + **(dict(obj=["a", "a", "b", np.nan, np.nan]) if s2c else {}), ) - getattr(orig, f"write_{diskfmt}")(adata_pth) - curr = getattr(ad, f"read_{diskfmt}")(adata_pth) + orig = ad.AnnData(obs=pd.DataFrame(obs)) + getattr(orig, f"write_{diskfmt}")(adata_pth, convert_strings_to_categoricals=s2c) + curr: ad.AnnData = getattr(ad, f"read_{diskfmt}")(adata_pth) assert np.all(orig.obs.notna() == curr.obs.notna()) assert np.all(orig.obs.stack().dropna() == curr.obs.stack().dropna()) + assert curr.obs["str"].dtype == ("category" if s2c else "string") + assert curr.obs["cat"].dtype == "category" def test_write_categorical_index(tmp_path, diskfmt):