Skip to content

Commit 8f3299b

Browse files
IO for nullable string arrays (#1558)
Co-authored-by: Ilan Gold <ilanbassgold@gmail.com>
1 parent 17222d4 commit 8f3299b

11 files changed

Lines changed: 280 additions & 94 deletions

File tree

.azure-pipelines.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -88,6 +88,7 @@ jobs:
8888
inputs:
8989
codeCoverageTool: Cobertura
9090
summaryFileLocation: "test-data/coverage.xml"
91+
failIfCoverageEmpty: true
9192
condition: eq(variables['TEST_TYPE'], 'coverage')
9293

9394
- task: PublishTestResults@2

docs/release-notes/1558.feature.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
Read and write support for nullable string arrays ({class}`pandas.arrays.StringArray`).
2+
Use the `mode.string_storage` option (see pandas’ {doc}`pandas:user_guide/options`) to control which storage mode is used when reading `dtype="string"` columns.
3+
{user}`flying-sheep`

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -115,8 +115,11 @@ version-file = "src/anndata/_version.py"
115115
packages = ["src/anndata", "src/testing"]
116116

117117
[tool.coverage.run]
118+
data_file = "test-data/coverage"
118119
source_pkgs = ["anndata"]
119120
omit = ["src/anndata/_version.py", "**/test_*.py"]
121+
[tool.coverage.xml]
122+
output = "test-data/coverage.xml"
120123
[tool.coverage.paths]
121124
source = ["./src", "**/site-packages"]
122125

src/anndata/_io/specs/methods.py

Lines changed: 87 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import warnings
34
from collections.abc import Mapping
45
from functools import partial
56
from itertools import product
@@ -10,6 +11,7 @@
1011
import h5py
1112
import numpy as np
1213
import pandas as pd
14+
from packaging.version import Version
1315
from scipy import sparse
1416

1517
import anndata as ad
@@ -37,13 +39,16 @@
3739
_require_group_write_dataframe,
3840
)
3941

42+
from ..._settings import settings
4043
from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial
4144

4245
if TYPE_CHECKING:
46+
from collections.abc import Callable
4347
from os import PathLike
4448
from typing import Any, Literal
4549

4650
from numpy import typing as npt
51+
from numpy.typing import NDArray
4752

4853
from anndata._types import (
4954
ArrayStorageType,
@@ -506,10 +511,12 @@ def write_vlen_string_array_zarr(
506511
):
507512
import numcodecs
508513

509-
# Workaround for https://github.com/zarr-developers/numcodecs/issues/514
510-
# TODO: Warn to upgrade numcodecs if fixed
511-
if not elem.flags.writeable:
512-
elem = elem.copy()
514+
if Version(numcodecs.__version__) < Version("0.13"):
515+
msg = "Old numcodecs version detected. Please update for improved performance and stability."
516+
warnings.warn(msg)
517+
# Workaround for https://github.com/zarr-developers/numcodecs/issues/514
518+
if hasattr(elem, "flags") and not elem.flags.writeable:
519+
elem = elem.copy()
513520

514521
f.create_dataset(
515522
k,
@@ -1014,44 +1021,85 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)):
10141021
@_REGISTRY.register_write(
10151022
ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0")
10161023
)
1017-
def write_nullable_integer(
1024+
@_REGISTRY.register_write(
1025+
H5Group, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
1026+
)
1027+
@_REGISTRY.register_write(
1028+
ZarrGroup, pd.arrays.StringArray, IOSpec("nullable-string-array", "0.1.0")
1029+
)
1030+
def write_nullable(
10181031
f: GroupStorageType,
10191032
k: str,
1020-
v: pd.arrays.IntegerArray | pd.arrays.BooleanArray,
1033+
v: pd.arrays.IntegerArray | pd.arrays.BooleanArray | pd.arrays.StringArray,
10211034
*,
10221035
_writer: Writer,
10231036
dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
10241037
):
1038+
if (
1039+
isinstance(v, pd.arrays.StringArray)
1040+
and not settings.allow_write_nullable_strings
1041+
):
1042+
msg = (
1043+
"`anndata.settings.allow_write_nullable_strings` is False, "
1044+
"because writing of `pd.arrays.StringArray` is new "
1045+
"and not supported in anndata < 0.11, still use by many people. "
1046+
"Opt-in to writing these arrays by toggling the setting to True."
1047+
)
1048+
raise RuntimeError(msg)
10251049
g = f.require_group(k)
1026-
if v._mask is not None:
1027-
_writer.write_elem(g, "mask", v._mask, dataset_kwargs=dataset_kwargs)
1028-
_writer.write_elem(g, "values", v._data, dataset_kwargs=dataset_kwargs)
1050+
values = (
1051+
v.to_numpy(na_value="")
1052+
if isinstance(v, pd.arrays.StringArray)
1053+
else v.to_numpy(na_value=0, dtype=v.dtype.numpy_dtype)
1054+
)
1055+
_writer.write_elem(g, "values", values, dataset_kwargs=dataset_kwargs)
1056+
_writer.write_elem(g, "mask", v.isna(), dataset_kwargs=dataset_kwargs)
10291057

10301058

1031-
@_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))
1032-
@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))
1033-
def read_nullable_integer(
1034-
elem: GroupStorageType, *, _reader: Reader
1059+
def _read_nullable(
1060+
elem: GroupStorageType,
1061+
*,
1062+
_reader: Reader,
1063+
# BaseMaskedArray
1064+
array_type: Callable[
1065+
[NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray
1066+
],
10351067
) -> pd.api.extensions.ExtensionArray:
1036-
if "mask" in elem:
1037-
return pd.arrays.IntegerArray(
1038-
_reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"])
1039-
)
1040-
else:
1041-
return pd.array(_reader.read_elem(elem["values"]))
1068+
return array_type(
1069+
_reader.read_elem(elem["values"]),
1070+
mask=_reader.read_elem(elem["mask"]),
1071+
)
10421072

10431073

1044-
@_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))
1045-
@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))
1046-
def read_nullable_boolean(
1047-
elem: GroupStorageType, *, _reader: Reader
1074+
def _string_array(
1075+
values: np.ndarray, mask: np.ndarray
10481076
) -> pd.api.extensions.ExtensionArray:
1049-
if "mask" in elem:
1050-
return pd.arrays.BooleanArray(
1051-
_reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"])
1052-
)
1053-
else:
1054-
return pd.array(_reader.read_elem(elem["values"]))
1077+
"""Construct a string array from values and mask."""
1078+
arr = pd.array(values, dtype="string")
1079+
arr[mask] = pd.NA
1080+
return arr
1081+
1082+
1083+
_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))(
1084+
read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray)
1085+
)
1086+
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))(
1087+
read_nullable_integer
1088+
)
1089+
1090+
_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))(
1091+
read_nullable_boolean := partial(_read_nullable, array_type=pd.arrays.BooleanArray)
1092+
)
1093+
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))(
1094+
read_nullable_boolean
1095+
)
1096+
1097+
_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))(
1098+
read_nullable_string := partial(_read_nullable, array_type=_string_array)
1099+
)
1100+
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))(
1101+
read_nullable_string
1102+
)
10551103

10561104

10571105
###########
@@ -1091,17 +1139,19 @@ def write_hdf5_scalar(
10911139
f.create_dataset(key, data=np.array(value), **dataset_kwargs)
10921140

10931141

1094-
# fmt: off
10951142
for numeric_scalar_type in [
1096-
bool, np.bool_,
1097-
np.uint8, np.uint16, np.uint32, np.uint64,
1098-
int, np.int8, np.int16, np.int32, np.int64,
1099-
float, *np.floating.__subclasses__(),
1143+
*(bool, np.bool_),
1144+
*(np.uint8, np.uint16, np.uint32, np.uint64),
1145+
*(int, np.int8, np.int16, np.int32, np.int64),
1146+
*(float, *np.floating.__subclasses__()),
11001147
*np.complexfloating.__subclasses__(),
11011148
]:
1102-
_REGISTRY.register_write(H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0"))(write_hdf5_scalar)
1103-
_REGISTRY.register_write(ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0"))(write_scalar)
1104-
# fmt: on
1149+
_REGISTRY.register_write(
1150+
H5Group, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
1151+
)(write_hdf5_scalar)
1152+
_REGISTRY.register_write(
1153+
ZarrGroup, numeric_scalar_type, IOSpec("numeric-scalar", "0.2.0")
1154+
)(write_scalar)
11051155

11061156
_REGISTRY.register_write(ZarrGroup, str, IOSpec("string", "0.2.0"))(write_scalar)
11071157
_REGISTRY.register_write(ZarrGroup, np.str_, IOSpec("string", "0.2.0"))(write_scalar)

src/anndata/_io/zarr.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,7 @@ def callback(func, elem_name: str, elem, iospec):
103103

104104
@report_read_key_on_error
105105
def read_dataset(dataset: zarr.Array):
106+
"""Legacy method for reading datasets without encoding_type."""
106107
value = dataset[...]
107108
if not hasattr(value, "dtype"):
108109
return value

src/anndata/_settings.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -420,5 +420,13 @@ def validate_bool(val) -> None:
420420
get_from_env=check_and_get_bool,
421421
)
422422

423+
settings.register(
424+
"allow_write_nullable_strings",
425+
default_value=False,
426+
description="Whether or not to allow writing of `pd.arrays.StringArray`.",
427+
validate=validate_bool,
428+
get_from_env=check_and_get_bool,
429+
)
430+
423431
##################################################################################
424432
##################################################################################

0 commit comments

Comments
 (0)