Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,20 @@ Writing formats that cannot represent all aspects of {class}`AnnData` objects.
AnnData.write_loom
```

(utilities-api)=

## Utilities

Helper functions used internally or for reshaping and aligning `AnnData` objects. They can be useful for custom workflows or edge cases.

```{eval-rst}
.. autosummary::
:toctree: generated/

utils.adapt_vars_like

```

(experimental-api)=

## Experimental API
Expand Down
3 changes: 2 additions & 1 deletion src/anndata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
WriteWarning,
)
from .io import read_h5ad, read_zarr
from .utils import module_get_attr_redirect
from .utils import adapt_vars_like, module_get_attr_redirect

# Submodules need to be imported last
from . import abc, experimental, typing, io, types # isort: skip
Expand Down Expand Up @@ -53,6 +53,7 @@ def __getattr__(attr_name: str) -> Any:
"WriteWarning",
"__version__",
"abc",
"adapt_vars_like",
"concat",
"experimental",
"io",
Expand Down
68 changes: 68 additions & 0 deletions src/anndata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,3 +450,71 @@
return getattr(mod, new_path)
msg = f"module {full_old_module_path} has no attribute {attr_name!r}"
raise AttributeError(msg)


def adapt_vars_like(
source: AnnData,
target: AnnData,
fill_value: float = 0.0,
) -> AnnData:
# source = AnnData object that defines the desired genes
# target = the data you want to reshape to match source
# fill_value = what value to use for missing genes (default set to 0.0)
# returns a new AnnData object with the same genes as source
"""
Adapt the `.var` structure of `target` to match that of `source`.
This function makes sure that the `target` AnnData object has the same set
of genes (`.var_names`) as the `source` AnnData object. It fills in
any missing genes in the `target` object with a specified `fill_value`.
Parameters
----------
source
Reference AnnData object whose genes (.var) define the desired structure.

Check failure on line 474 in src/anndata/utils.py

View workflow job for this annotation

GitHub Actions / Check for spelling errors

Refernece ==> Reference
target
AnnData object to be adapted to match the source's gene structure.
fill_value
Value used to fill in missing genes. Defaults to 0.0.
Returns
-------
AnnData
A new AnnData object with the genes matching the source's structure and data from
`target`, with missing values filled in with `fill_value`.
"""
# importing here to avoid circular import issues
from ._core.anndata import AnnData
from ._core.merge import Reindexer

# This guard is needed because calling target.X[:, target.var.index]
# would raise an error if target.X is None
if target.X is None:
msg = "target.X is None; cannot adapt vars without a data matrix."
raise ValueError(msg)
# making a copy to use in the new AnnData object returned later
new_var = source.var.copy()
# handling the case when not all source genes are in target
if not source.var_names.isin(target.var_names).all():
# manual fix
# computing the list of genes that are in source and target
shared = source.var_names.intersection(target.var_names)
# getting positions of the shared genes in source and target
source_idx = new_var.index.get_indexer(shared)
target_idx = target.var_names.get_indexer(shared)
# creating a new matrix of shape (number of cells, number of genes in source)
# filled with the fill_value
new_x = np.full((target.n_obs, new_var.shape[0]), fill_value)
# for the genes that are in both source and target, copy over the values
new_x[:, source_idx] = target.X[:, target_idx]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does ReIndexer not also handle this? Might be worth adding this case to the ReIndexer in that case. In any case, we need to do this "operation" over all parts of the AnnData object.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I double-checked, and it does. I rewrote this function, so if you wouldn't mind checking it, I'd appreciate it!

else:
# in other cases just use reindexer
reindexer = Reindexer(new_var.index, target.var.index)
new_x = reindexer(target.X, fill_value=fill_value)
# creates a new AnnData object with the new .X and .var
# .X is the filled new_x array
# .obs is a copy of the target.obs
# .var is copied from source.var, ensuring alignment of gene annotations
new_adata = AnnData(X=new_x, obs=target.obs.copy(), var=new_var)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We'll want to do the whole AnnData object. So you'll need to use the ReIndexer (which operates on all types of matrices, dataframes etc) on all the parts of the object, I think, something like

reindexer = Reindexer(new_var.index, target.var.index)

AnnData(X=reindexer(target.X, fill_value=fill_value), obs=reindexer(target.obs, fill_value=fill_value), obsm={k: reindexer(v, fill_value=fill_value) for k, v in obsm.items()}...)

and so forth. Does that make sense?

return new_adata
47 changes: 46 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@

from itertools import repeat

import numpy as np
import pandas as pd
import pytest
from scipy import sparse

import anndata as ad
from anndata.tests.helpers import gen_typed_df
from anndata.utils import make_index_unique
from anndata.utils import adapt_vars_like, make_index_unique


def test_make_index_unique():
Expand Down Expand Up @@ -55,3 +56,47 @@ def test_adata_unique_indices():

pd.testing.assert_index_equal(v.obsm["df"].index, v.obs_names)
pd.testing.assert_index_equal(v.varm["df"].index, v.var_names)


def test_adapt_vars_exact_match():
    """Check adapt_vars_like when source and target share the same var names.

    Both objects have the same number of variables in the same order, so the
    output's X must equal the target's X and the var index must match.
    """
    reference = ad.AnnData(
        X=np.ones((1, 3)),
        var=pd.DataFrame(index=["a", "b", "c"]),
    )
    query = ad.AnnData(
        X=np.array([[1, 2, 3]]),
        var=pd.DataFrame(index=["a", "b", "c"]),
    )

    adapted = adapt_vars_like(reference, query)

    np.testing.assert_array_equal(adapted.X, query.X)
    index_matches = adapted.var.index == query.var.index
    assert index_matches.all()


def test_adapt_vars_different_order():
    """Check adapt_vars_like when the var names match but are ordered differently.

    The target lists its variables in reverse, so the output's X is expected
    to be reordered to follow the source's variable order.
    """
    reference = ad.AnnData(
        X=np.ones((1, 3)),
        var=pd.DataFrame(index=["a", "b", "c"]),
    )
    reversed_query = ad.AnnData(
        X=np.array([[3, 2, 1]]),
        var=pd.DataFrame(index=["c", "b", "a"]),
    )

    adapted = adapt_vars_like(reference, reversed_query)

    np.testing.assert_array_equal(adapted.X, [[1, 2, 3]])


def test_adapt_vars_none_X_raises():
    """Check that adapt_vars_like raises ValueError when the target has no X."""
    gene_names = ["g1", "g2"]
    reference = ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=gene_names))
    empty_query = ad.AnnData(X=None, var=pd.DataFrame(index=gene_names))

    with pytest.raises(ValueError, match="target.X is None"):
        adapt_vars_like(reference, empty_query)


def test_adapt_vars_no_shared_genes():
    """Check that disjoint var names yield an X made up only of the fill value."""
    reference = ad.AnnData(
        X=np.ones((1, 2)),
        var=pd.DataFrame(index=["g1", "g2"]),
    )
    disjoint_query = ad.AnnData(
        X=np.array([[7, 8]]),
        var=pd.DataFrame(index=["g3", "g4"]),
    )

    adapted = adapt_vars_like(reference, disjoint_query, fill_value=0.5)

    np.testing.assert_array_equal(adapted.X, [[0.5, 0.5]])


def test_adapt_vars_missing_genes():
    """Check that a gene absent from the target is filled with the fill value.

    The target lacks "g2", so the middle column of the output is expected to
    hold the fill value while "g1" and "g3" keep the target's data.
    """
    reference = ad.AnnData(
        X=np.ones((1, 3)),
        var=pd.DataFrame(index=["g1", "g2", "g3"]),
    )
    partial_query = ad.AnnData(
        X=np.array([[1, 3]]),
        var=pd.DataFrame(index=["g1", "g3"]),
    )

    adapted = adapt_vars_like(reference, partial_query, fill_value=-1)

    np.testing.assert_array_equal(adapted.X, [[1, -1, 3]])
Loading