Skip to content
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,20 @@ Writing formats that cannot represent all aspects of {class}`AnnData` objects.
AnnData.write_loom
```

(utilities-api)=

## Utilities

Helper functions used internally or for reshaping and aligning `AnnData` objects. They can be useful for custom workflows or edge cases.

```{eval-rst}
.. autosummary::
:toctree: generated/

utils.adapt_vars_like

```

(experimental-api)=

## Experimental API
Expand Down
1 change: 1 addition & 0 deletions src/anndata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,5 @@ def __getattr__(attr_name: str) -> Any:
"settings",
"types",
"typing",
"utils",
]
2 changes: 1 addition & 1 deletion src/anndata/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from .compat import Index as _Index

if TYPE_CHECKING:
from typing import TypeAlias
from typing import AxisStorable, TypeAlias


__all__ = ["AxisStorable", "Index", "RWAble"]
Expand Down
92 changes: 91 additions & 1 deletion src/anndata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

if TYPE_CHECKING:
from collections.abc import Iterable, Mapping, Sequence
from typing import Any, Literal
from typing import Any, AxisStorable, Literal

logger = get_logger(__name__)

Expand Down Expand Up @@ -450,3 +450,93 @@
return getattr(mod, new_path)
msg = f"module {full_old_module_path} has no attribute {attr_name!r}"
raise AttributeError(msg)


def adapt_vars_like(
source: AnnData,
target: AnnData,
fill_value: float = 0.0,
) -> AnnData:
"""
Adapt the `.var` structure of `target` to match that of `source`.

This function makes sure that the `target` AnnData object has the same set
of genes (`.var_names`) as the `source` AnnData object. It fills in
any missing genes in the `target` object with a specified `fill_value`.

Parameters
----------
source
Reference AnnData object whose genes (.var) define the desired structure.
target
AnnData object to be adapted to match the source's gene structure.
fill_value
Value used to fill in missing genes. Defaults to 0.0.

Returns
-------
AnnData
A new AnnData object with the genes matching the source's structure and data from
`target`, with missing values filled in with `fill_value`.

"""
# importing here to avoid circular import issues
from ._core.anndata import AnnData
from ._core.merge import Reindexer

# copy over the var structure from source = becomes new feature index structure
# that we want target to match
new_var = source.var.copy()
# Initializing reindexer = help map the old gene indices to the new structure
reindexer = Reindexer(target.var.index, new_var.index)
# if target object actually has .X matrix (i.e. expression data), I reindex it to match the source

if target.X is not None:
new_X = reindexer(target.X, axis=1, fill_value=fill_value)
else:
# otherwise I just create a dummy matrix of the right shape filled with a constant value
new_X = np.full((target.n_obs, len(new_var)), fill_value)

Check warning on line 498 in src/anndata/utils.py

View check run for this annotation

Codecov / codecov/patch

src/anndata/utils.py#L498

Added line #L498 was not covered by tests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to create a new X, I think. Things should work in its absence, no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, Reindexer(target.X) won’t crash if target.X is None, it’ll just return None. But the point of the if/else is to make sure new_X is always a proper matrix with the right shape, filled with fill_value, so things downstream don’t break. If we drop the if, we’d end up passing None into places that probably assume a real array. That’s how I interpreted it at least but let me know if that assumption doesn’t hold.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I think my point here was that if target has no X we should probably not create an empty one. We can just set X=None when declaring the new AnnData instead of https://github.com/scverse/anndata/pull/1986/files#diff-22197e419767db6d7078531198e8c055d27d35510281a164e8343ec48fa9a938R523 setting it to this np.full

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. I addressed that issue!


# reindexing each layer matrix along the gene (column) axis so it matches new structure
new_layers = {
k: reindexer(v, axis=1, fill_value=fill_value) for k, v in target.layers.items()
}
# reindex varm, which stores matrix-like annotations for each gene:
# each entry is reindexed along the gene axis (axis 0),
# with positions for genes missing from target filled with fill_value
new_varm: AxisStorable = {
k: reindexer(v, axis=0, fill_value=fill_value) for k, v in target.varm.items()
}

new_varp: AxisStorable = {
k: reindexer(
reindexer(v, axis=0, fill_value=fill_value), axis=1, fill_value=fill_value
)
for k, v in target.varp.items()
}

new_obsp = {k: v.copy() for k, v in target.obsp.items()}

# creating the new AnnData object
# directly copying .obs without changes - we are not touching the cells here, just aligning features
new_adata = AnnData(
X=new_X,
obs=target.obs.copy(),
var=new_var,
varm=new_varm,
varp=new_varp,
obsp=new_obsp,
layers=new_layers,
)
# if the original target had a .raw layer, reindex it as well
# since .raw is immutable (from what I understand), we create a new AnnData object
# with matching .X, .obs, .var assigning it directly to new_adata.raw
if target.raw is not None:
new_raw_X = reindexer(target.raw.X, axis=1, fill_value=fill_value)
new_adata.raw = AnnData(

Check warning on line 536 in src/anndata/utils.py

View check run for this annotation

Codecov / codecov/patch

src/anndata/utils.py#L535-L536

Added lines #L535 - L536 were not covered by tests
X=new_raw_X,
var=source.var.copy(),
obs=target.obs.copy(),
)

return new_adata
55 changes: 54 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@

from itertools import repeat

import numpy as np
import pandas as pd
import pytest
from scipy import sparse

import anndata as ad
from anndata.tests.helpers import gen_typed_df
from anndata.utils import make_index_unique
from anndata.utils import adapt_vars_like, make_index_unique


def test_make_index_unique():
Expand Down Expand Up @@ -55,3 +56,55 @@ def test_adata_unique_indices():

pd.testing.assert_index_equal(v.obsm["df"].index, v.obs_names)
pd.testing.assert_index_equal(v.varm["df"].index, v.var_names)


@pytest.mark.parametrize(
("source", "target", "expected_X"),
[
pytest.param(
ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])),
ad.AnnData(
X=np.array([[1, 2, 3]]), var=pd.DataFrame(index=["a", "b", "c"])
),
np.array([[1, 2, 3]]),
id="exact_match",
),
pytest.param(
ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])),
ad.AnnData(
X=np.array([[3, 2, 1]]), var=pd.DataFrame(index=["c", "b", "a"])
),
np.array([[1, 2, 3]]),
id="different_order",
),
],
)
def test_adapt_vars(source, target, expected_X):
output = adapt_vars_like(source, target)
np.testing.assert_array_equal(output.X, expected_X)
assert list(output.var_names) == list(source.var_names)


@pytest.mark.parametrize(
("source", "target", "fill_value", "expected_X"),
[
pytest.param(
ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=["g1", "g2"])),
ad.AnnData(X=np.array([[7, 8]]), var=pd.DataFrame(index=["g3", "g4"])),
0.5,
np.array([[0.5, 0.5]]),
id="no_shared_genes",
),
pytest.param(
ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["g1", "g2", "g3"])),
ad.AnnData(X=np.array([[1, 3]]), var=pd.DataFrame(index=["g1", "g3"])),
-1,
np.array([[1, -1, 3]]),
id="missing_genes",
),
],
)
def test_adapt_vars_with_fill_value(source, target, fill_value, expected_X):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's merge this test and test_adapt_vars and have a fill value of None for the test_adapt_vars ones in the param

output = adapt_vars_like(source, target, fill_value=fill_value)
np.testing.assert_array_equal(output.X, expected_X)
assert list(output.var_names) == list(source.var_names)
Loading