scverse · amalia-k510 · May 14, 2025 · May 14, 2025 · May 14, 2025 · May 14, 2025
diff --git a/docs/api.md b/docs/api.md
@@ -118,6 +118,20 @@ Writing formats that cannot represent all aspects of {class}`AnnData` objects.
    AnnData.write_loom
 ```
 
+(utilities-api)=
+
+## Utilities
+
+Helper functions used internally or for reshaping and aligning `AnnData` objects. Can be useful for custom workflows or edge cases.
+
+```{eval-rst}
+.. autosummary::
+	:toctree: generated/
+
+	utils.adapt_vars_like
+
+```
+
 (experimental-api)=
 
 ## Experimental API

diff --git a/src/anndata/__init__.py b/src/anndata/__init__.py
@@ -62,4 +62,5 @@ def __getattr__(attr_name: str) -> Any:
     "settings",
     "types",
     "typing",
+    "utils",
 ]
diff --git a/src/anndata/utils.py b/src/anndata/utils.py
@@ -20,6 +20,7 @@
     from collections.abc import Iterable, Mapping, Sequence
     from typing import Any, Literal
 
+
 logger = get_logger(__name__)
 
 
@@ -450,3 +451,82 @@ def module_get_attr_redirect(
         return getattr(mod, new_path)
     msg = f"module {full_old_module_path} has no attribute {attr_name!r}"
     raise AttributeError(msg)
+
+
+def adapt_vars_like(
+    source: AnnData,
+    target: AnnData,
+    fill_value: float = 0.0,
+) -> AnnData:
+    """
+    Adapt the `.var` structure of `target` to match that of `source`.
+
+    Build a new AnnData object that has the same variables (`.var_names`), in
+    the same order, as `source` and that carries the data of `target`.
+    Variables of `source` that are missing from `target` are filled with
+    `fill_value`; variables found only in `target` are not carried over.
+
+    Parameters
+    ----------
+    source
+        Reference AnnData object whose `.var` defines the desired structure.
+    target
+        AnnData object whose data is aligned to the variables of `source`.
+    fill_value
+        Value written for variables of `source` that are missing from
+        `target`. Defaults to 0.0.
+
+    Returns
+    -------
+    AnnData
+        A new AnnData object with `.var` copied from `source`, `.obs` copied
+        from `target`, and the `.X`, `.layers`, `.varm` and (if set) `.raw` of
+        `target` reindexed along the variable axis.
+    """
+    # importing here to avoid circular import issues
+    from ._core.anndata import AnnData
+    from ._core.merge import Reindexer
+
+    # the new variable annotation is a copy, so `source` itself is not modified
+    new_var = source.var.copy()
+    # maps the variable index of `target` onto the new variable index
+    reindexer = Reindexer(target.var.index, new_var.index)
+
+    if target.X is not None:
+        new_X = reindexer(target.X, axis=1, fill_value=fill_value)
+    else:
+        # nothing to align: use a constant matrix of the new shape instead
+        new_X = np.full((target.n_obs, len(new_var)), fill_value)
+
+    # layers are reindexed along the variable (column) axis, like X
+    new_layers = {
+        k: reindexer(v, axis=1, fill_value=fill_value) for k, v in target.layers.items()
+    }
+    # varm entries are reindexed along axis 0; the result is stored as returned
+    # by the reindexer rather than flattened into nested Python lists
+    new_varm = {
+        k: reindexer(v, axis=0, fill_value=fill_value) for k, v in target.varm.items()
+    }
+
+    # .obs is copied unchanged: only the variable axis is realigned
+    new_adata = AnnData(
+        X=new_X,
+        obs=target.obs.copy(),
+        var=new_var,
+        varm=new_varm,
+        layers=new_layers,
+    )
+
+    if target.raw is not None:
+        # .raw carries its own variable index, which can differ from
+        # target.var, so it gets a reindexer built from raw.var instead of
+        # reusing the one built from target.var
+        raw_reindexer = Reindexer(target.raw.var.index, new_var.index)
+        new_raw_X = raw_reindexer(target.raw.X, axis=1, fill_value=fill_value)
+        new_adata.raw = AnnData(
+            X=new_raw_X,
+            var=source.var.copy(),
+            obs=target.obs.copy(),
+        )
+
+    return new_adata
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -2,13 +2,14 @@
 
 from itertools import repeat
 
+import numpy as np
 import pandas as pd
 import pytest
 from scipy import sparse
 
 import anndata as ad
 from anndata.tests.helpers import gen_typed_df
-from anndata.utils import make_index_unique
+from anndata.utils import adapt_vars_like, make_index_unique
 
 
 def test_make_index_unique():
@@ -55,3 +56,55 @@ def test_adata_unique_indices():
 
     pd.testing.assert_index_equal(v.obsm["df"].index, v.obs_names)
     pd.testing.assert_index_equal(v.varm["df"].index, v.var_names)
+
+
+@pytest.mark.parametrize(
+    ("source", "target", "expected_X"),
+    [
+        (
+            ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])),
+            ad.AnnData(
+                X=np.array([[1, 2, 3]]), var=pd.DataFrame(index=["a", "b", "c"])
+            ),
+            np.array([[1, 2, 3]]),
+        ),
+        (
+            ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])),
+            ad.AnnData(
+                X=np.array([[3, 2, 1]]), var=pd.DataFrame(index=["c", "b", "a"])
+            ),
+            np.array([[1, 2, 3]]),
+        ),
+    ],
+    ids=["exact_match", "different_order"],
+)
+def test_adapt_vars(source, target, expected_X):
+    """Values of `target` end up in the variable order of `source`."""
+    adapted = adapt_vars_like(source, target)
+    np.testing.assert_array_equal(adapted.X, expected_X)
+    assert adapted.var_names.tolist() == source.var_names.tolist()
+
+
+@pytest.mark.parametrize(
+    ("source", "target", "fill_value", "expected_X"),
+    [
+        (
+            ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=["g1", "g2"])),
+            ad.AnnData(X=np.array([[7, 8]]), var=pd.DataFrame(index=["g3", "g4"])),
+            0.5,
+            np.array([[0.5, 0.5]]),
+        ),
+        (
+            ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["g1", "g2", "g3"])),
+            ad.AnnData(X=np.array([[1, 3]]), var=pd.DataFrame(index=["g1", "g3"])),
+            -1,
+            np.array([[1, -1, 3]]),
+        ),
+    ],
+    ids=["no_shared_genes", "missing_genes"],
+)
+def test_adapt_vars_with_fill_value(source, target, fill_value, expected_X):
+    """Variables absent from `target` are filled with the given `fill_value`."""
+    adapted = adapt_vars_like(source, target, fill_value=fill_value)
+    np.testing.assert_array_equal(adapted.X, expected_X)
+    assert adapted.var_names.tolist() == source.var_names.tolist()