scverse · amalia-k510 · May 14, 2025 · May 14, 2025 · May 14, 2025 · May 14, 2025
diff --git a/docs/api.md b/docs/api.md
@@ -118,6 +118,20 @@ Writing formats that cannot represent all aspects of {class}`AnnData` objects.
    AnnData.write_loom
 ```
 
+(utilities-api)=
+
+## Utilities
+
+Helper functions used internally or for reshaping and aligning `AnnData` objects. They can be useful for custom workflows or edge cases.
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated/
+
+    utils.adapt_vars_like
+
+```
+
 (experimental-api)=
 
 ## Experimental API

diff --git a/src/anndata/__init__.py b/src/anndata/__init__.py
@@ -20,7 +20,7 @@
     WriteWarning,
 )
 from .io import read_h5ad, read_zarr
-from .utils import module_get_attr_redirect
+from .utils import adapt_vars_like, module_get_attr_redirect
 
 # Submodules need to be imported last
 from . import abc, experimental, typing, io, types  # isort: skip
@@ -53,6 +53,7 @@ def __getattr__(attr_name: str) -> Any:
     "WriteWarning",
     "__version__",
     "abc",
+    "adapt_vars_like",
     "concat",
     "experimental",
     "io",

diff --git a/src/anndata/utils.py b/src/anndata/utils.py
@@ -450,3 +450,74 @@
         return getattr(mod, new_path)
     msg = f"module {full_old_module_path} has no attribute {attr_name!r}"
     raise AttributeError(msg)
+
+
+def adapt_vars_like(
+    source: AnnData,
+    target: AnnData,
+    fill_value: float = 0.0,
+) -> AnnData:
+    """
+    Adapt the `.var` structure of `target` to match that of `source`.
+
+    This function makes sure that the returned AnnData object has the same set
+    of genes (`.var_names`) as the `source` AnnData object. Any genes of
+    `source` that are missing from `target` are filled in with `fill_value`.
+
+    Parameters
+    ----------
+    source
+        Reference AnnData object whose genes (`.var`) define the desired structure.
+    target
+        AnnData object to be adapted to match the source's gene structure.
+        Its `.X` must not be `None`.
+    fill_value
+        Value used to fill in missing genes. Defaults to 0.0.
+
+    Returns
+    -------
+    AnnData
+        A new AnnData object with the genes matching the source's structure and
+        data from `target`, with missing values filled in with `fill_value`.
+        Its `.var` is a copy of `source.var` and its `.obs` a copy of `target.obs`.
+
+    Raises
+    ------
+    ValueError
+        If `target.X` is `None`.
+    """
+    from scipy import sparse
+
+    # importing here to avoid circular import issues
+    from ._core.anndata import AnnData
+    from ._core.merge import Reindexer
+
+    # target.X is sliced by column below, which fails if it is None
+    if target.X is None:
+        msg = "target.X is None; cannot adapt vars without a data matrix."
+        raise ValueError(msg)
+    # copy so that the returned object does not share its .var with source
+    new_var = source.var.copy()
+    if not source.var_names.isin(target.var_names).all():
+        # some source genes are absent from target: assemble the matrix by hand
+        shared = source.var_names.intersection(target.var_names)
+        # column positions of the shared genes in source and in target
+        source_idx = new_var.index.get_indexer(shared)
+        target_idx = target.var_names.get_indexer(shared)
+        shared_x = target.X[:, target_idx]
+        if sparse.issparse(shared_x):
+            # a scipy sparse matrix cannot be assigned into a NumPy slice
+            shared_x = shared_x.toarray()
+        # np.full alone would take its dtype from fill_value and silently cast the
+        # copied data to it (e.g. 1.5 -> 1 for fill_value=-1), so promote both
+        dtype = np.promote_types(shared_x.dtype, np.asarray(fill_value).dtype)
+        # matrix of shape (number of cells, number of genes in source)
+        new_x = np.full((target.n_obs, new_var.shape[0]), fill_value, dtype=dtype)
+        new_x[:, source_idx] = shared_x
+    else:
+        # every source gene is present in target: let Reindexer do the work
+        # NOTE(review): Reindexer lives in _core.merge and is not visible here;
+        # confirm its argument order and that it reindexes the var axis
+        reindexer = Reindexer(new_var.index, target.var.index)
+        new_x = reindexer(target.X, fill_value=fill_value)
+    return AnnData(X=new_x, obs=target.obs.copy(), var=new_var)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -2,13 +2,14 @@
 
 from itertools import repeat
 
+import numpy as np
 import pandas as pd
 import pytest
 from scipy import sparse
 
 import anndata as ad
 from anndata.tests.helpers import gen_typed_df
-from anndata.utils import make_index_unique
+from anndata.utils import adapt_vars_like, make_index_unique
 
 
 def test_make_index_unique():
@@ -55,3 +56,50 @@ def test_adata_unique_indices():
 
     pd.testing.assert_index_equal(v.obsm["df"].index, v.obs_names)
     pd.testing.assert_index_equal(v.varm["df"].index, v.var_names)
+
+
+def _adata_with_vars(x, var_names):
+    """Build an AnnData whose `.var` is an empty frame indexed by `var_names`."""
+    return ad.AnnData(X=x, var=pd.DataFrame(index=var_names))
+
+
+def test_adapt_vars_exact_match():
+    """Same var names in the same order: `.X` and the var index are unchanged."""
+    var_names = ["a", "b", "c"]
+    source = _adata_with_vars(np.ones((1, 3)), var_names)
+    target = _adata_with_vars(np.array([[1, 2, 3]]), var_names)
+    adapted = adapt_vars_like(source, target)
+    np.testing.assert_array_equal(adapted.X, target.X)
+    assert (adapted.var.index == target.var.index).all()
+
+
+def test_adapt_vars_different_order():
+    """Same var names in a different order: columns follow the source order."""
+    source = _adata_with_vars(np.ones((1, 3)), ["a", "b", "c"])
+    target = _adata_with_vars(np.array([[3, 2, 1]]), ["c", "b", "a"])
+    adapted = adapt_vars_like(source, target)
+    np.testing.assert_array_equal(adapted.X, [[1, 2, 3]])
+
+
+def test_adapt_vars_none_X_raises():
+    """A target without a data matrix is rejected with a `ValueError`."""
+    source = _adata_with_vars(np.ones((1, 2)), ["g1", "g2"])
+    target = _adata_with_vars(None, ["g1", "g2"])
+    with pytest.raises(ValueError, match="target.X is None"):
+        adapt_vars_like(source, target)
+
+
+def test_adapt_vars_no_shared_genes():
+    """No var name in common: every column holds `fill_value`."""
+    source = _adata_with_vars(np.ones((1, 2)), ["g1", "g2"])
+    target = _adata_with_vars(np.array([[7, 8]]), ["g3", "g4"])
+    adapted = adapt_vars_like(source, target, fill_value=0.5)
+    np.testing.assert_array_equal(adapted.X, [[0.5, 0.5]])
+
+
+def test_adapt_vars_missing_genes():
+    """Source genes absent from target are filled; shared ones keep their data."""
+    source = _adata_with_vars(np.ones((1, 3)), ["g1", "g2", "g3"])
+    target = _adata_with_vars(np.array([[1, 3]]), ["g1", "g3"])
+    adapted = adapt_vars_like(source, target, fill_value=-1)
+    np.testing.assert_array_equal(adapted.X, [[1, -1, 3]])