|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
| 3 | +import warnings |
3 | 4 | from collections.abc import Mapping |
4 | 5 | from functools import partial |
5 | 6 | from itertools import product |
|
10 | 11 | import h5py |
11 | 12 | import numpy as np |
12 | 13 | import pandas as pd |
| 14 | +from packaging.version import Version |
13 | 15 | from scipy import sparse |
14 | 16 |
|
15 | 17 | import anndata as ad |
|
37 | 39 | _require_group_write_dataframe, |
38 | 40 | ) |
39 | 41 |
|
| 42 | +from ..._settings import settings |
40 | 43 | from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial |
41 | 44 |
|
42 | 45 | if TYPE_CHECKING: |
| 46 | + from collections.abc import Callable |
43 | 47 | from os import PathLike |
44 | 48 | from typing import Any, Literal |
45 | 49 |
|
46 | 50 | from numpy import typing as npt |
| 51 | + from numpy.typing import NDArray |
47 | 52 |
|
48 | 53 | from anndata._types import ( |
49 | 54 | ArrayStorageType, |
@@ -506,10 +511,12 @@ def write_vlen_string_array_zarr( |
506 | 511 | ): |
507 | 512 | import numcodecs |
508 | 513 |
|
509 | | - # Workaround for https://github.com/zarr-developers/numcodecs/issues/514 |
510 | | - # TODO: Warn to upgrade numcodecs if fixed |
511 | | - if not elem.flags.writeable: |
512 | | - elem = elem.copy() |
| 514 | + if Version(numcodecs.__version__) < Version("0.13"): |
| 515 | + msg = "Old numcodecs version detected. Please update for improved performance and stability." |
| 516 | + warnings.warn(msg) |
| 517 | + # Workaround for https://github.com/zarr-developers/numcodecs/issues/514 |
| 518 | + if hasattr(elem, "flags") and not elem.flags.writeable: |
| 519 | + elem = elem.copy() |
513 | 520 |
|
514 | 521 | f.create_dataset( |
515 | 522 | k, |
@@ -1014,44 +1021,85 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)): |
@_REGISTRY.register_write(
    ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0")
)
@_REGISTRY.register_write(
    H5Group, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
)
@_REGISTRY.register_write(
    ZarrGroup, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
)
def write_nullable(
    f: GroupStorageType,
    k: str,
    v: pd.arrays.IntegerArray | pd.arrays.BooleanArray | pd.arrays.StringArray,
    *,
    _writer: Writer,
    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
):
    """Write a nullable pandas array as a group holding ``values`` and ``mask``.

    Parameters
    ----------
    f
        Parent group; the child group ``k`` is obtained via ``require_group``.
    k
        Name of the child group to write into.
    v
        The nullable array to store.
    _writer
        Writer used to store the two child elements.
    dataset_kwargs
        Forwarded unchanged to both ``write_elem`` calls.

    Raises
    ------
    RuntimeError
        If ``v`` is a ``pd.arrays.StringArray`` and
        ``settings.allow_write_nullable_strings`` is falsy.
    """
    is_string = isinstance(v, pd.arrays.StringArray)
    if is_string and not settings.allow_write_nullable_strings:
        msg = (
            "`anndata.settings.allow_write_nullable_strings` is False, "
            "because writing of `pd.arrays.StringArray` is new "
            "and not supported in anndata < 0.11, still used by many people. "
            "Opt-in to writing these arrays by toggling the setting to True."
        )
        raise RuntimeError(msg)
    g = f.require_group(k)
    # Missing entries need a placeholder in the dense `values` array:
    # the empty string for string arrays, 0 in the array's own numpy dtype otherwise.
    # Which entries are actually missing is recorded separately in `mask`.
    if is_string:
        values = v.to_numpy(na_value="")
    else:
        values = v.to_numpy(na_value=0, dtype=v.dtype.numpy_dtype)
    _writer.write_elem(g, "values", values, dataset_kwargs=dataset_kwargs)
    _writer.write_elem(g, "mask", v.isna(), dataset_kwargs=dataset_kwargs)
1029 | 1057 |
|
1030 | 1058 |
|
1031 | | -@_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) |
1032 | | -@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) |
1033 | | -def read_nullable_integer( |
1034 | | - elem: GroupStorageType, *, _reader: Reader |
| 1059 | +def _read_nullable( |
| 1060 | + elem: GroupStorageType, |
| 1061 | + *, |
| 1062 | + _reader: Reader, |
| 1063 | + # BaseMaskedArray |
| 1064 | + array_type: Callable[ |
| 1065 | + [NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray |
| 1066 | + ], |
1035 | 1067 | ) -> pd.api.extensions.ExtensionArray: |
1036 | | - if "mask" in elem: |
1037 | | - return pd.arrays.IntegerArray( |
1038 | | - _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) |
1039 | | - ) |
1040 | | - else: |
1041 | | - return pd.array(_reader.read_elem(elem["values"])) |
| 1068 | + return array_type( |
| 1069 | + _reader.read_elem(elem["values"]), |
| 1070 | + mask=_reader.read_elem(elem["mask"]), |
| 1071 | + ) |
1042 | 1072 |
|
1043 | 1073 |
|
1044 | | -@_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) |
1045 | | -@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) |
1046 | | -def read_nullable_boolean( |
1047 | | - elem: GroupStorageType, *, _reader: Reader |
| 1074 | +def _string_array( |
| 1075 | + values: np.ndarray, mask: np.ndarray |
1048 | 1076 | ) -> pd.api.extensions.ExtensionArray: |
1049 | | - if "mask" in elem: |
1050 | | - return pd.arrays.BooleanArray( |
1051 | | - _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) |
1052 | | - ) |
1053 | | - else: |
1054 | | - return pd.array(_reader.read_elem(elem["values"])) |
| 1077 | + """Construct a string array from values and mask.""" |
| 1078 | + arr = pd.array(values, dtype="string") |
| 1079 | + arr[mask] = pd.NA |
| 1080 | + return arr |
| 1081 | + |
| 1082 | + |
# Readers for the nullable encodings. Each one is `_read_nullable` bound to the
# callable that combines the stored values and mask, registered for both
# HDF5 and Zarr groups.
read_nullable_integer = partial(_read_nullable, array_type=pd.arrays.IntegerArray)
_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))(
    read_nullable_integer
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))(
    read_nullable_integer
)

read_nullable_boolean = partial(_read_nullable, array_type=pd.arrays.BooleanArray)
_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))(
    read_nullable_boolean
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))(
    read_nullable_boolean
)

read_nullable_string = partial(_read_nullable, array_type=_string_array)
_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))(
    read_nullable_string
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))(
    read_nullable_string
)
1055 | 1103 |
|
1056 | 1104 |
|
1057 | 1105 | ########### |
@@ -1091,17 +1139,19 @@ def write_hdf5_scalar( |
1091 | 1139 | f.create_dataset(key, data=np.array(value), **dataset_kwargs) |
1092 | 1140 |
|
1093 | 1141 |
|
1094 | | -# fmt: off |
# Register the scalar writers for every numeric scalar type: Python and NumPy
# booleans, unsigned and signed integers, floats, and the direct subclasses of
# `np.floating` and `np.complexfloating`.
for numeric_scalar_type in (
    bool,
    np.bool_,
    np.uint8,
    np.uint16,
    np.uint32,
    np.uint64,
    int,
    np.int8,
    np.int16,
    np.int32,
    np.int64,
    float,
    *np.floating.__subclasses__(),
    *np.complexfloating.__subclasses__(),
):
    # HDF5 groups use the HDF5-specific scalar writer.
    _REGISTRY.register_write(
        H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
    )(write_hdf5_scalar)
    # Zarr groups use the generic scalar writer.
    _REGISTRY.register_write(
        ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
    )(write_scalar)
1105 | 1155 |
|
1106 | 1156 | _REGISTRY.register_write(ZarrGroup, str, IOSpec("string", "0.2.0"))(write_scalar) |
1107 | 1157 | _REGISTRY.register_write(ZarrGroup, np.str_, IOSpec("string", "0.2.0"))(write_scalar) |
|
0 commit comments