diff --git a/docs/release-notes/2159.perf.md b/docs/release-notes/2159.perf.md new file mode 100644 index 000000000..195c92222 --- /dev/null +++ b/docs/release-notes/2159.perf.md @@ -0,0 +1 @@ +Add a `write_csr_csc_indices_with_min_possible_dtype` option to {attr}`anndata.settings` to enable downcasting of the `indices` of csr and csc matrices to a smaller dtype when writing. For example, if your csr matrix only has 30000 columns, then you can write out the `indices` of that matrix as `uint16` instead of `int64`. {user}`ilan-gold` diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 931b63f22..c5f5fd08c 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -727,6 +727,24 @@ def write_sparse_compressed( for attr_name in ["data", "indices", "indptr"]: attr = getattr(value, attr_name) dtype = indptr_dtype if attr_name == "indptr" else attr.dtype + if ( + attr_name == "indices" + and settings.write_csr_csc_indices_with_min_possible_dtype + ): + # np.min_scalar_type can return things like np.ulonglong which zarr doesn't understand + # and I find this clearer as to what the result type is i.e., unsigned or signed. + # For example `np.iinfo(np.uint16).max + 1` could be either `uint32` or `int32`, + # and there's nothing in numpy's docs disallowing this output to change. + if (minor_axis_size := value.shape[value.format == "csr"]) <= np.iinfo( + np.uint8 + ).max: + dtype = np.dtype("uint8") + elif minor_axis_size <= np.iinfo(np.uint16).max: + dtype = np.dtype("uint16") + elif minor_axis_size <= np.iinfo(np.uint32).max: + dtype = np.dtype("uint32") + elif minor_axis_size <= np.iinfo(np.uint64).max: + dtype = np.dtype("uint64") if isinstance(f, H5Group) or is_zarr_v2(): g.create_dataset( attr_name, data=attr, shape=attr.shape, dtype=dtype, **dataset_kwargs diff --git a/src/anndata/_settings.py b/src/anndata/_settings.py index 1003b9bbe..2ceedabf0 100644 --- a/src/anndata/_settings.py +++ b/src/anndata/_settings.py @@ -500,6 +500,14 @@ def validate_sparse_settings(val: Any, settings: SettingsManager) -> None: get_from_env=check_and_get_bool, ) +settings.register( + "write_csr_csc_indices_with_min_possible_dtype", + default_value=False, + description="Write a csr or csc matrix with the minimum possible data type for `indices`, always unsigned integer.", + validate=validate_bool, + get_from_env=check_and_get_bool, +) + settings.register( "auto_shard_zarr_v3", default_value=False, diff --git a/src/anndata/_settings.pyi b/src/anndata/_settings.pyi index 5b16b74fc..a3dbb3611 100644 --- a/src/anndata/_settings.pyi +++ b/src/anndata/_settings.pyi @@ -44,6 +44,7 @@ class _AnnDataSettingsManager(SettingsManager): use_sparse_array_on_read: bool = False min_rows_for_chunked_h5_copy: int = 1000 disallow_forward_slash_in_h5ad: bool = False + write_csr_csc_indices_with_min_possible_dtype: bool = False auto_shard_zarr_v3: bool = False settings: _AnnDataSettingsManager diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index d44e9c312..27c99bda7 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -430,6 +430,113 @@ def test_write_indptr_dtype_override(store, sparse_format): np.testing.assert_array_equal(store["X/indptr"][...], X.indptr) +@pytest.mark.parametrize( + ("num_minor_axis", "expected_dtype"), + [ + pytest.param(1, np.dtype("uint8"), id="one_col-expected_uint8_on_disk"), + pytest.param( + np.iinfo(np.uint8).max, + np.dtype("uint8"), + id="max_np.uint8-matching_dtype_on_disk", + ), + pytest.param( + np.iinfo(np.int8).max, + np.dtype("uint8"), + id="max_np.int8-uint8_on_disk", + ), + pytest.param( + np.iinfo(np.uint16).max, + np.dtype("uint16"), + id="max_np.uint16-matching_dtype_on_disk", + ), + pytest.param( + np.iinfo(np.int16).max, + np.dtype("uint16"), + id="max_np.int16-uint16_on_disk", + ), + pytest.param( + np.iinfo(np.uint32).max, + np.dtype("uint32"), + id="max_np.uint32-matching_dtype_on_disk", + ), + pytest.param( + np.iinfo(np.int32).max, + np.dtype("uint32"), + id="max_np.int32-uint32_on_disk", + ), + pytest.param( + np.iinfo(np.uint8).max + 1, + np.dtype("uint16"), + id="max_np.uint8_plus_one_cols-expected_uint16_on_disk", + ), + pytest.param( + np.iinfo(np.uint16).max + 1, + np.dtype("uint32"), + id="max_np.uint16_plus_one_cols-expected_uint32_on_disk", + ), + pytest.param( + np.iinfo(np.uint32).max + 1, + np.dtype("uint64"), + id="max_np.uint32_plus_one_cols-expected_uint64_on_disk", + ), + pytest.param( + np.iinfo(np.int64).max + 1, + np.dtype("uint64"), + id="max_np.int64_plus_one_cols-expected_uint64_on_disk", + marks=pytest.mark.xfail( + reason="scipy sparse does not support bigger than max(int64) values in indices and there is no uint128." + ), + ), + pytest.param( + np.iinfo(np.uint64).max + 1, + np.dtype("uint64"), + id="max_np.uint64_plus_one_cols-expected_uint64_on_disk", + marks=pytest.mark.xfail( + reason="scipy sparse does not support bigger than max(int64) values in indices and there is no uint128." + ), + ), + ], +) +@pytest.mark.parametrize("format", ["csr", "csc"]) +def test_write_indices_min( + store: H5Group | ZarrGroup, + num_minor_axis: int, + expected_dtype: np.dtype, + format: Literal["csr", "csc"], +): + minor_axis_index = np.array([num_minor_axis - 1]) + major_axis_index = np.array([10]) + row_cols = ( + (minor_axis_index, major_axis_index) + if format == "csc" + else (major_axis_index, minor_axis_index) + ) + shape = (num_minor_axis, 20) if format == "csc" else (20, num_minor_axis) + X = getattr(sparse, f"{format}_array")( + (np.array([10]), row_cols), + shape=shape, + ) + assert X.nnz == 1 + with ad.settings.override(write_csr_csc_indices_with_min_possible_dtype=True): + write_elem(store, "X", X) + + assert store["X/indices"].dtype == expected_dtype + with ad.settings.override(use_sparse_array_on_read=True): + result = read_elem(store["X"]) + assert_equal(result.data, X.data) + assert_equal(result.indices, X.indices) + assert_equal(result.indptr, X.indptr) + assert X.format == result.format + assert result.shape == X.shape + # != comparison converts to csr, which allocates a lot of memory or errors out with: + # ValueError: array is too big; `arr.size * arr.dtype.itemsize` is larger than the maximum possible size. + # Because the old, very large, minor axis is now the major axis and so either it fails to create or the indptr is very big. + # The above tests should be enough to capture the desired equality checks so this is mostly for being extra sure. + # See https://github.com/scipy/scipy/issues/23826 + if not (format == "csc" and num_minor_axis > np.iinfo(np.uint16).max + 1): + assert (result != X).nnz == 0 + + def test_io_spec_raw(store): adata = gen_adata((3, 2), **GEN_ADATA_NO_XARRAY_ARGS) adata.raw = adata.copy()