From 06bb519daabd03e8026639609f73f8cccd247f6a Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 14 May 2025 12:40:17 +0200 Subject: [PATCH 01/17] gene panel selection feature --- src/anndata/utils.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 21ca8d06b..959ce8a18 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -20,6 +20,8 @@ from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal + from ._core.anndata import AnnData + logger = get_logger(__name__) @@ -450,3 +452,36 @@ def module_get_attr_redirect( return getattr(mod, new_path) msg = f"module {full_old_module_path} has no attribute {attr_name!r}" raise AttributeError(msg) + + +def adapt_vars_like( + source: AnnData, target: AnnData, fill_value: float = 0.0 +) -> AnnData: + """ + Make target have the same .var (genes) as source., missing genes are filled with fill_value. + """ + # needed to add it as when trying to call target.X[:, target.var.index] + # it would raise an error if target.X is None + if target.X is None: + msg = "target.X is None; cannot adapt vars without a data matrix." + raise ValueError(msg) + # this will become the .var of returned AnnData + new_var = source.var.copy() + # this will become the .X matrix. Makes sure all genes in source are + # represented, and placeholders are ready for copying shared ones + new_x = np.full((target.n_obs, new_var.shape[0]), fill_value, dtype=target.X.dtype) + + shared_genes = source.var_names.intersection(target.var_names) + # positions of shared genes in source + source_idx = new_var.index.get_indexer(shared_genes) + # positions of those same genes in target + target_idx = target.var.index.get_indexer(shared_genes) + # fills the new .X array for all target cells + # inserts expression values from target.X into the correct columns of + # the new_x that match shared genes + # only genes in both source and target are copied over. Everything else + # remains at fill_value + new_x[:, source_idx] = target.X[:, target_idx] + # creates a new AnnData object with the new .X and .var + new_adata = AnnData(X=new_x, obs=target.obs.copy(), var=new_var) + return new_adata From 24da34527a87e3201ee2d70238893daa38c63998 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 14 May 2025 13:02:06 +0200 Subject: [PATCH 02/17] comments fix --- src/anndata/utils.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 959ce8a18..0364072e7 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -457,6 +457,10 @@ def module_get_attr_redirect( def adapt_vars_like( source: AnnData, target: AnnData, fill_value: float = 0.0 ) -> AnnData: + # source = AnnData object that defines the desired genes + # target = the data you want to reshape to match source + # fill_vlaue = what value to use for missing genes (default set to 0.0) + # returns a new AnnData object with the same genes as source """ Make target have the same .var (genes) as source., missing genes are filled with fill_value. """ @@ -465,23 +469,28 @@ def adapt_vars_like( if target.X is None: msg = "target.X is None; cannot adapt vars without a data matrix." raise ValueError(msg) - # this will become the .var of returned AnnData + # this will define the gene list we want to match new_var = source.var.copy() - # this will become the .X matrix. Makes sure all genes in source are - # represented, and placeholders are ready for copying shared ones + # initializing a new dense np array of shape (number of target cells, number of genes in source) + # filled with fill_value + # this will become the new .X matrix. + # It makes sure all genes in source are represented, and placeholders are ready for copying shared ones new_x = np.full((target.n_obs, new_var.shape[0]), fill_value, dtype=target.X.dtype) - + # finds gene names that appeare in both source and target shared_genes = source.var_names.intersection(target.var_names) # positions of shared genes in source source_idx = new_var.index.get_indexer(shared_genes) # positions of those same genes in target target_idx = target.var.index.get_indexer(shared_genes) - # fills the new .X array for all target cells - # inserts expression values from target.X into the correct columns of - # the new_x that match shared genes - # only genes in both source and target are copied over. Everything else - # remains at fill_value + # fills the new .X array for all target cells (rows) + # also inserts expression values from target.X into the correct columns of new_x + # for the shared genes + # only genes in both source and target are copied over. + # everything else remains at fill_value new_x[:, source_idx] = target.X[:, target_idx] # creates a new AnnData object with the new .X and .var + # .X is the filled new_x array + # .obs is a copy of the target.obs + # .var is copied from source.var, making sure alignment of gene annotations new_adata = AnnData(X=new_x, obs=target.obs.copy(), var=new_var) return new_adata From 74b3ebd3b2b2b7bda7c8c0220353b9bce6a7e22e Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 14 May 2025 13:08:52 +0200 Subject: [PATCH 03/17] import error fix --- src/anndata/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 0364072e7..19b60aafa 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -12,6 +12,7 @@ import anndata +from ._core.anndata import AnnData from ._core.sparse_dataset import BaseCompressedSparseDataset from .compat import CSArray, CupyArray, CupySparseMatrix, DaskArray from .logging import get_logger @@ -20,8 +21,6 @@ from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal - from ._core.anndata import AnnData - logger = get_logger(__name__) From aa2295fa39f799427e23c8caa513ebc0118dba5a Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 14 May 2025 13:31:55 +0200 Subject: [PATCH 04/17] import error fix and init script update to make new fxn accessible --- src/anndata/__init__.py | 3 ++- src/anndata/utils.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/anndata/__init__.py b/src/anndata/__init__.py index 5925837f6..bb3b0c58c 100644 --- a/src/anndata/__init__.py +++ b/src/anndata/__init__.py @@ -20,7 +20,7 @@ WriteWarning, ) from .io import read_h5ad, read_zarr -from .utils import module_get_attr_redirect +from .utils import adapt_vars_like, module_get_attr_redirect # Submodules need to be imported last from . import abc, experimental, typing, io, types # isort: skip @@ -53,6 +53,7 @@ def __getattr__(attr_name: str) -> Any: "WriteWarning", "__version__", "abc", + "adapt_vars_like", "concat", "experimental", "io", diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 19b60aafa..38a7c575f 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -12,7 +12,6 @@ import anndata -from ._core.anndata import AnnData from ._core.sparse_dataset import BaseCompressedSparseDataset from .compat import CSArray, CupyArray, CupySparseMatrix, DaskArray from .logging import get_logger @@ -463,6 +462,9 @@ def adapt_vars_like( """ Make target have the same .var (genes) as source., missing genes are filled with fill_value. """ + # importing here to avoid circular import issues + from ._core.anndata import AnnData + # needed to add it as when trying to call target.X[:, target.var.index] # it would raise an error if target.X is None if target.X is None: From 16264bf57f7caaa1ce6a4bbbff32218cd8c25faf Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 18 May 2025 14:53:46 +0200 Subject: [PATCH 05/17] doc string fix and api.md added --- docs/api.md | 14 ++++++++++++++ src/anndata/utils.py | 26 ++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/docs/api.md b/docs/api.md index ba6de634d..801397f21 100644 --- a/docs/api.md +++ b/docs/api.md @@ -118,6 +118,18 @@ Writing formats that cannot represent all aspects of {class}`AnnData` objects. AnnData.write_loom ``` +(utilities-api)= +## Utilities +Helper functions used internationally or for reshaping and aligng `AnnData` objects. Can be useful for cusotm workflows or edge cases. + +```{eval-rst} +.. autosummary:: + :toctree: generated/ + + utils.adapt_vars_like + +``` + (experimental-api)= ## Experimental API @@ -204,6 +216,8 @@ Types used by the former: types.ExtensionNamespace ``` + + (errors-api)= ## Errors and warnings diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 38a7c575f..0685c769c 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -453,14 +453,36 @@ def module_get_attr_redirect( def adapt_vars_like( - source: AnnData, target: AnnData, fill_value: float = 0.0 + source: AnnData, + target: AnnData, + fill_value: float = 0.0, ) -> AnnData: # source = AnnData object that defines the desired genes # target = the data you want to reshape to match source # fill_vlaue = what value to use for missing genes (default set to 0.0) # returns a new AnnData object with the same genes as source """ - Make target have the same .var (genes) as source., missing genes are filled with fill_value. + Adapt the `.var` structure of `target` to match that of `source`. + + This function makes sure that the `target` AnnData object has the same set + of genes (`.var_names`) as the `source` AnnData object. It fills in the + any missing genes in the `target` object with a specified `fill_value`. + + Parameters + ---------- + source + Refernece AnnData object whose genes (.var) define the desired structure. + target + AnnData object to be adapted to match the source's gene structure. + fill_value + Value used to fill in missing genes. Defaults to 0.0. + + Returns + ------- + AnnData + A new AnnData object with the genes matching the source's structure and data from + `target`, with missing values filled in with `fill_value`. + """ # importing here to avoid circular import issues from ._core.anndata import AnnData From 46b728fa55f4a5e6bcf4c4ec23af8bc6e021ee85 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 18 May 2025 12:54:04 +0000 Subject: [PATCH 06/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 801397f21..af302cb9f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -125,7 +125,7 @@ Helper functions used internationally or for reshaping and aligng `AnnData` obje ```{eval-rst} .. autosummary:: :toctree: generated/ - + utils.adapt_vars_like ``` From c09cf637d5da0bee78ee8c69764dbbec0b72637a Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 18 May 2025 15:10:06 +0200 Subject: [PATCH 07/17] typo fix --- docs/api.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/api.md b/docs/api.md index 801397f21..25790c51f 100644 --- a/docs/api.md +++ b/docs/api.md @@ -119,13 +119,15 @@ Writing formats that cannot represent all aspects of {class}`AnnData` objects. ``` (utilities-api)= + ## Utilities + Helper functions used internationally or for reshaping and aligng `AnnData` objects. Can be useful for cusotm workflows or edge cases. ```{eval-rst} .. autosummary:: :toctree: generated/ - + utils.adapt_vars_like ``` @@ -216,8 +218,6 @@ Types used by the former: types.ExtensionNamespace ``` - - (errors-api)= ## Errors and warnings From 0dd787886838506a6be2fa771160e087b3857949 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 18 May 2025 15:49:16 +0200 Subject: [PATCH 08/17] Switch to reindex --- src/anndata/utils.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 0685c769c..3c132baa1 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -486,6 +486,7 @@ def adapt_vars_like( """ # importing here to avoid circular import issues from ._core.anndata import AnnData + from ._core.merge import Reindexer # needed to add it as when trying to call target.X[:, target.var.index] # it would raise an error if target.X is None @@ -498,19 +499,8 @@ def adapt_vars_like( # filled with fill_value # this will become the new .X matrix. # It makes sure all genes in source are represented, and placeholders are ready for copying shared ones - new_x = np.full((target.n_obs, new_var.shape[0]), fill_value, dtype=target.X.dtype) - # finds gene names that appeare in both source and target - shared_genes = source.var_names.intersection(target.var_names) - # positions of shared genes in source - source_idx = new_var.index.get_indexer(shared_genes) - # positions of those same genes in target - target_idx = target.var.index.get_indexer(shared_genes) - # fills the new .X array for all target cells (rows) - # also inserts expression values from target.X into the correct columns of new_x - # for the shared genes - # only genes in both source and target are copied over. - # everything else remains at fill_value - new_x[:, source_idx] = target.X[:, target_idx] + reindexer = Reindexer(new_var.index, target.var.index, fill_value=fill_value) + new_x = reindexer(target.X) # creates a new AnnData object with the new .X and .var # .X is the filled new_x array # .obs is a copy of the target.obs From 5d6279ec940012e7987a523ee4137900bb33dfd9 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 18 May 2025 17:16:45 +0200 Subject: [PATCH 09/17] tests and manual fix for the missing genes case --- src/anndata/utils.py | 25 ++++++++++++++++------- tests/test_utils.py | 47 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 3c132baa1..bedb63947 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -493,14 +493,25 @@ def adapt_vars_like( if target.X is None: msg = "target.X is None; cannot adapt vars without a data matrix." raise ValueError(msg) - # this will define the gene list we want to match + # making a copy to use in the new AnnData object returned later new_var = source.var.copy() - # initializing a new dense np array of shape (number of target cells, number of genes in source) - # filled with fill_value - # this will become the new .X matrix. - # It makes sure all genes in source are represented, and placeholders are ready for copying shared ones - reindexer = Reindexer(new_var.index, target.var.index, fill_value=fill_value) - new_x = reindexer(target.X) + # handling the case when not all source genes are in target + if not source.var_names.isin(target.var_names).all(): + # manual fix + # computing the list of genes that are in source and target + shared = source.var_names.intersection(target.var_names) + # getting positions of the shared genes in source and target + source_idx = new_var.index.get_indexer(shared) + target_idx = target.var_names.get_indexer(shared) + # creating a new matrix of shape (number of cells, number of genes in source) + # filled with the fill_value + new_x = np.full((target.n_obs, new_var.shape[0]), fill_value) + # for the genes that are in both source and target, copy over the values + new_x[:, source_idx] = target.X[:, target_idx] + else: + # in other cases just use reindexer + reindexer = Reindexer(new_var.index, target.var.index) + new_x = reindexer(target.X, fill_value=fill_value) # creates a new AnnData object with the new .X and .var # .X is the filled new_x array # .obs is a copy of the target.obs diff --git a/tests/test_utils.py b/tests/test_utils.py index f57fc5d6e..ced838f66 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,13 +2,14 @@ from itertools import repeat +import numpy as np import pandas as pd import pytest from scipy import sparse import anndata as ad from anndata.tests.helpers import gen_typed_df -from anndata.utils import make_index_unique +from anndata.utils import adapt_vars_like, make_index_unique def test_make_index_unique(): @@ -55,3 +56,47 @@ def test_adata_unique_indices(): pd.testing.assert_index_equal(v.obsm["df"].index, v.obs_names) pd.testing.assert_index_equal(v.varm["df"].index, v.var_names) + + +def test_adapt_vars_exact_match(): + # Test that adapt_vars_like works when the source and target have the same var names + # and the same number of variables + source = ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])) + target = ad.AnnData( + X=np.array([[1, 2, 3]]), var=pd.DataFrame(index=["a", "b", "c"]) + ) + output = adapt_vars_like(source, target) + np.testing.assert_array_equal(output.X, target.X) + assert (output.var.index == target.var.index).all() + + +def test_adapt_vars_different_order(): + # Test that adapt_vars_like works when the source and target have the same var names + # but in a different order + source = ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])) + target = ad.AnnData( + X=np.array([[3, 2, 1]]), var=pd.DataFrame(index=["c", "b", "a"]) + ) + output = adapt_vars_like(source, target) + np.testing.assert_array_equal(output.X, [[1, 2, 3]]) + + +def test_adapt_vars_none_X_raises(): + source = ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=["g1", "g2"])) + target = ad.AnnData(X=None, var=pd.DataFrame(index=["g1", "g2"])) + with pytest.raises(ValueError, match="target.X is None"): + adapt_vars_like(source, target) + + +def test_adapt_vars_no_shared_genes(): + source = ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=["g1", "g2"])) + target = ad.AnnData(X=np.array([[7, 8]]), var=pd.DataFrame(index=["g3", "g4"])) + output = adapt_vars_like(source, target, fill_value=0.5) + np.testing.assert_array_equal(output.X, [[0.5, 0.5]]) + + +def test_adapt_vars_missing_genes(): + source = ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["g1", "g2", "g3"])) + target = ad.AnnData(X=np.array([[1, 3]]), var=pd.DataFrame(index=["g1", "g3"])) + output = adapt_vars_like(source, target, fill_value=-1) + np.testing.assert_array_equal(output.X, [[1, -1, 3]]) From a14af7f68e2cfa565ea84d1e956664db38b56daa Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 26 May 2025 14:10:00 +0200 Subject: [PATCH 10/17] test fix and comments --- src/anndata/utils.py | 4 --- tests/test_utils.py | 79 ++++++++++++++++++++++++++------------------ 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index bedb63947..a4370ff73 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -457,10 +457,6 @@ def adapt_vars_like( target: AnnData, fill_value: float = 0.0, ) -> AnnData: - # source = AnnData object that defines the desired genes - # target = the data you want to reshape to match source - # fill_vlaue = what value to use for missing genes (default set to 0.0) - # returns a new AnnData object with the same genes as source """ Adapt the `.var` structure of `target` to match that of `source`. diff --git a/tests/test_utils.py b/tests/test_utils.py index ced838f66..5e8109ef3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -58,27 +58,31 @@ def test_adata_unique_indices(): pd.testing.assert_index_equal(v.varm["df"].index, v.var_names) -def test_adapt_vars_exact_match(): - # Test that adapt_vars_like works when the source and target have the same var names - # and the same number of variables - source = ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])) - target = ad.AnnData( - X=np.array([[1, 2, 3]]), var=pd.DataFrame(index=["a", "b", "c"]) - ) - output = adapt_vars_like(source, target) - np.testing.assert_array_equal(output.X, target.X) - assert (output.var.index == target.var.index).all() - - -def test_adapt_vars_different_order(): - # Test that adapt_vars_like works when the source and target have the same var names - # but in a different order - source = ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])) - target = ad.AnnData( - X=np.array([[3, 2, 1]]), var=pd.DataFrame(index=["c", "b", "a"]) - ) +@pytest.mark.parametrize( + ("source", "target", "expected_X"), + [ + pytest.param( + ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])), + ad.AnnData( + X=np.array([[1, 2, 3]]), var=pd.DataFrame(index=["a", "b", "c"]) + ), + np.array([[1, 2, 3]]), + id="exact_match", + ), + pytest.param( + ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["a", "b", "c"])), + ad.AnnData( + X=np.array([[3, 2, 1]]), var=pd.DataFrame(index=["c", "b", "a"]) + ), + np.array([[1, 2, 3]]), + id="different_order", + ), + ], +) +def test_adapt_vars(source, target, expected_X): output = adapt_vars_like(source, target) - np.testing.assert_array_equal(output.X, [[1, 2, 3]]) + np.testing.assert_array_equal(output.X, expected_X) + assert list(output.var_names) == list(source.var_names) def test_adapt_vars_none_X_raises(): @@ -88,15 +92,26 @@ def test_adapt_vars_none_X_raises(): adapt_vars_like(source, target) -def test_adapt_vars_no_shared_genes(): - source = ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=["g1", "g2"])) - target = ad.AnnData(X=np.array([[7, 8]]), var=pd.DataFrame(index=["g3", "g4"])) - output = adapt_vars_like(source, target, fill_value=0.5) - np.testing.assert_array_equal(output.X, [[0.5, 0.5]]) - - -def test_adapt_vars_missing_genes(): - source = ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["g1", "g2", "g3"])) - target = ad.AnnData(X=np.array([[1, 3]]), var=pd.DataFrame(index=["g1", "g3"])) - output = adapt_vars_like(source, target, fill_value=-1) - np.testing.assert_array_equal(output.X, [[1, -1, 3]]) +@pytest.mark.parametrize( + ("source", "target", "fill_value", "expected_X"), + [ + pytest.param( + ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=["g1", "g2"])), + ad.AnnData(X=np.array([[7, 8]]), var=pd.DataFrame(index=["g3", "g4"])), + 0.5, + np.array([[0.5, 0.5]]), + id="no_shared_genes", + ), + pytest.param( + ad.AnnData(X=np.ones((1, 3)), var=pd.DataFrame(index=["g1", "g2", "g3"])), + ad.AnnData(X=np.array([[1, 3]]), var=pd.DataFrame(index=["g1", "g3"])), + -1, + np.array([[1, -1, 3]]), + id="missing_genes", + ), + ], +) +def test_adapt_vars_with_fill_value(source, target, fill_value, expected_X): + output = adapt_vars_like(source, target, fill_value=fill_value) + np.testing.assert_array_equal(output.X, expected_X) + assert list(output.var_names) == list(source.var_names) From a30c365ca848e6e441b6b5b4ed6adee41a67b23a Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 28 May 2025 14:24:48 +0200 Subject: [PATCH 11/17] reindexer fix --- src/anndata/utils.py | 69 +++++++++++++++++++++++++++----------------- tests/test_utils.py | 7 ----- 2 files changed, 42 insertions(+), 34 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index a4370ff73..51667b7ef 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -484,33 +484,48 @@ def adapt_vars_like( from ._core.anndata import AnnData from ._core.merge import Reindexer - # needed to add it as when trying to call target.X[:, target.var.index] - # it would raise an error if target.X is None - if target.X is None: - msg = "target.X is None; cannot adapt vars without a data matrix." - raise ValueError(msg) - # making a copy to use in the new AnnData object returned later + # copy over the var structure from source = becomes new feature index structure + # that we want target to match new_var = source.var.copy() - # handling the case when not all source genes are in target - if not source.var_names.isin(target.var_names).all(): - # manual fix - # computing the list of genes that are in source and target - shared = source.var_names.intersection(target.var_names) - # getting positions of the shared genes in source and target - source_idx = new_var.index.get_indexer(shared) - target_idx = target.var_names.get_indexer(shared) - # creating a new matrix of shape (number of cells, number of genes in source) - # filled with the fill_value - new_x = np.full((target.n_obs, new_var.shape[0]), fill_value) - # for the genes that are in both source and target, copy over the values - new_x[:, source_idx] = target.X[:, target_idx] + # Initializing reindexer = help map the old gene indices to the new structure + reindexer = Reindexer(target.var.index, new_var.index) + # if target object actually has .X matrix (i.e. expression data), I reindex it to match the source + + if target.X is not None: + new_X = reindexer(target.X, axis=1, fill_value=fill_value) else: - # in other cases just use reindexer - reindexer = Reindexer(new_var.index, target.var.index) - new_x = reindexer(target.X, fill_value=fill_value) - # creates a new AnnData object with the new .X and .var - # .X is the filled new_x array - # .obs is a copy of the target.obs - # .var is copied from source.var, making sure alignment of gene annotations - new_adata = AnnData(X=new_x, obs=target.obs.copy(), var=new_var) + # otherwise I just create a dummy matrix of the right shape filled with a constant value + new_X = np.full((target.n_obs, len(new_var)), fill_value) + + # reindexing each layer matrix along the gene (column) axis so it matches new structure + new_layers = { + k: reindexer(v, axis=1, fill_value=fill_value) for k, v in target.layers.items() + } + # reindex varm which stores matrix-like annotation for each gene + # for each entry, reindex along the gene axis, cast it to a numpy array to make it uniform + # convert it to a plain python list to avoid type checker error + new_varm: dict[str, Sequence[Any]] = { + k: np.asarray(reindexer(v, axis=0, fill_value=fill_value)).tolist() + for k, v in target.varm.items() + } + # creating new Anndata Object + # directly copying .obs without changes - we ar enot touching the cells here, just aligning features + new_adata = AnnData( + X=new_X, + obs=target.obs.copy(), + var=new_var, + varm=new_varm, + layers=new_layers, + ) + # if the original target fad a .raw layer, reindex it as well + # since .raw is immutable (from what I understand), we create a new AnnData object + # with matching .X, .obs, .var assigning it directly to new_adata.raw + if target.raw is not None: + new_raw_X = reindexer(target.raw.X, axis=1, fill_value=fill_value) + new_adata.raw = AnnData( + X=new_raw_X, + var=source.var.copy(), + obs=target.obs.copy(), + ) + return new_adata diff --git a/tests/test_utils.py b/tests/test_utils.py index 5e8109ef3..36f5a5f39 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -85,13 +85,6 @@ def test_adapt_vars(source, target, expected_X): assert list(output.var_names) == list(source.var_names) -def test_adapt_vars_none_X_raises(): - source = ad.AnnData(X=np.ones((1, 2)), var=pd.DataFrame(index=["g1", "g2"])) - target = ad.AnnData(X=None, var=pd.DataFrame(index=["g1", "g2"])) - with pytest.raises(ValueError, match="target.X is None"): - adapt_vars_like(source, target) - - @pytest.mark.parametrize( ("source", "target", "fill_value", "expected_X"), [ From fa48833089b555f3cb341fb1ffc65256beec6c38 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 28 May 2025 14:25:22 +0200 Subject: [PATCH 12/17] Update __init__.py --- src/anndata/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/__init__.py b/src/anndata/__init__.py index bb3b0c58c..65c99b0db 100644 --- a/src/anndata/__init__.py +++ b/src/anndata/__init__.py @@ -53,7 +53,6 @@ def __getattr__(attr_name: str) -> Any: "WriteWarning", "__version__", "abc", - "adapt_vars_like", "concat", "experimental", "io", @@ -63,4 +62,5 @@ def __getattr__(attr_name: str) -> Any: "settings", "types", "typing", + "utils", ] From 11fdb500f6def7ee2eef7dd143d1823dc8498eea Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Wed, 28 May 2025 14:30:42 +0200 Subject: [PATCH 13/17] import error and spelling error --- src/anndata/__init__.py | 2 +- src/anndata/utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/anndata/__init__.py b/src/anndata/__init__.py index 65c99b0db..a77b00183 100644 --- a/src/anndata/__init__.py +++ b/src/anndata/__init__.py @@ -20,7 +20,7 @@ WriteWarning, ) from .io import read_h5ad, read_zarr -from .utils import adapt_vars_like, module_get_attr_redirect +from .utils import module_get_attr_redirect # Submodules need to be imported last from . import abc, experimental, typing, io, types # isort: skip diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 51667b7ef..2da35bd71 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -20,6 +20,7 @@ from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal + logger = get_logger(__name__) @@ -467,7 +468,7 @@ def adapt_vars_like( Parameters ---------- source - Refernece AnnData object whose genes (.var) define the desired structure. + Reference AnnData object whose genes (.var) define the desired structure. target AnnData object to be adapted to match the source's gene structure. fill_value From 715fd0f7a756cd26985305d2652e0177da9af81d Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 1 Jun 2025 14:26:07 -0700 Subject: [PATCH 14/17] AxisStorable implementaiton --- src/anndata/typing.py | 2 +- src/anndata/utils.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/anndata/typing.py b/src/anndata/typing.py index f0cf974b4..9fa914dcc 100644 --- a/src/anndata/typing.py +++ b/src/anndata/typing.py @@ -22,7 +22,7 @@ from .compat import Index as _Index if TYPE_CHECKING: - from typing import TypeAlias + from typing import AxisStorable, TypeAlias __all__ = ["AxisStorable", "Index", "RWAble"] diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 2da35bd71..104766822 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -18,8 +18,7 @@ if TYPE_CHECKING: from collections.abc import Iterable, Mapping, Sequence - from typing import Any, Literal - + from typing import Any, AxisStorable, Literal logger = get_logger(__name__) @@ -505,9 +504,12 @@ def adapt_vars_like( # reindex varm which stores matrix-like annotation for each gene # for each entry, reindex along the gene axis, cast it to a numpy array to make it uniform # convert it to a plain python list to avoid type checker error - new_varm: dict[str, Sequence[Any]] = { - k: np.asarray(reindexer(v, axis=0, fill_value=fill_value)).tolist() - for k, v in target.varm.items() + # new_varm: dict[str, Sequence[Any]] = { + # k: np.asarray(reindexer(v, axis=0, fill_value=fill_value)).tolist() + # for k, v in target.varm.items() + # } + new_varm: AxisStorable = { + k: reindexer(v, axis=0, fill_value=fill_value) for k, v in target.varm.items() } # creating new Anndata Object # directly copying .obs without changes - we ar enot touching the cells here, just aligning features From 63a29337c3bc52c083e20cab23121ee2b29245f4 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Sun, 1 Jun 2025 14:43:52 -0700 Subject: [PATCH 15/17] adding new_varp and new_obsp for consistency --- src/anndata/utils.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 104766822..7183af796 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -504,20 +504,28 @@ def adapt_vars_like( # reindex varm which stores matrix-like annotation for each gene # for each entry, reindex along the gene axis, cast it to a numpy array to make it uniform # convert it to a plain python list to avoid type checker error - # new_varm: dict[str, Sequence[Any]] = { - # k: np.asarray(reindexer(v, axis=0, fill_value=fill_value)).tolist() - # for k, v in target.varm.items() - # } new_varm: AxisStorable = { k: reindexer(v, axis=0, fill_value=fill_value) for k, v in target.varm.items() } + + new_varp: AxisStorable = { + k: reindexer( + reindexer(v, axis=0, fill_value=fill_value), axis=1, fill_value=fill_value + ) + for k, v in target.varp.items() + } + + new_obsp = {k: v.copy() for k, v in target.obsp.items()} + # creating new Anndata Object - # directly copying .obs without changes - we ar enot touching the cells here, just aligning features + # directly copying .obs without changes - we are not touching the cells here, just aligning features new_adata = AnnData( X=new_X, obs=target.obs.copy(), var=new_var, varm=new_varm, + varp=new_varp, + obsp=new_obsp, layers=new_layers, ) # if the original target fad a .raw layer, reindex it as well From 8c306469e3087e970290f58aa1d4bb5edb9c088a Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 2 Jun 2025 16:35:30 -0700 Subject: [PATCH 16/17] target change to None and test for it --- src/anndata/utils.py | 23 +++++------------------ tests/test_utils.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 7183af796..a23bc36df 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -484,26 +484,17 @@ def adapt_vars_like( from ._core.anndata import AnnData from ._core.merge import Reindexer - # copy over the var structure from source = becomes new feature index structure - # that we want target to match new_var = source.var.copy() - # Initializing reindexer = help map the old gene indices to the new structure reindexer = Reindexer(target.var.index, new_var.index) - # if target object actually has .X matrix (i.e. expression data), I reindex it to match the source - - if target.X is not None: - new_X = reindexer(target.X, axis=1, fill_value=fill_value) + if target.X is None: + new_X = None else: - # otherwise I just create a dummy matrix of the right shape filled with a constant value - new_X = np.full((target.n_obs, len(new_var)), fill_value) + new_X = reindexer(target.X, axis=1, fill_value=fill_value) - # reindexing each layer matrix along the gene (column) axis so it matches new structure new_layers = { k: reindexer(v, axis=1, fill_value=fill_value) for k, v in target.layers.items() } - # reindex varm which stores matrix-like annotation for each gene - # for each entry, reindex along the gene axis, cast it to a numpy array to make it uniform - # convert it to a plain python list to avoid type checker error + new_varm: AxisStorable = { k: reindexer(v, axis=0, fill_value=fill_value) for k, v in target.varm.items() } @@ -517,8 +508,6 @@ def adapt_vars_like( new_obsp = {k: v.copy() for k, v in target.obsp.items()} - # creating new Anndata Object - # directly copying .obs without changes - we are not touching the cells here, just aligning features new_adata = AnnData( X=new_X, obs=target.obs.copy(), @@ -528,9 +517,7 @@ def adapt_vars_like( obsp=new_obsp, layers=new_layers, ) - # if the original target fad a .raw layer, reindex it as well - # since .raw is immutable (from what I understand), we create a new AnnData object - # with matching .X, .obs, .var assigning it directly to new_adata.raw + if target.raw is not None: new_raw_X = reindexer(target.raw.X, axis=1, fill_value=fill_value) new_adata.raw = AnnData( diff --git a/tests/test_utils.py b/tests/test_utils.py index 36f5a5f39..9bffc76ae 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -108,3 +108,18 @@ def test_adapt_vars_with_fill_value(source, target, fill_value, expected_X): output = adapt_vars_like(source, target, fill_value=fill_value) np.testing.assert_array_equal(output.X, expected_X) assert list(output.var_names) == list(source.var_names) + + +def test_adapt_vars_target_X_none(): + source = ad.AnnData( + X=np.ones((2, 2)), + var=pd.DataFrame(index=["g1", "g2"]), + ) + target = ad.AnnData( + X=None, + var=pd.DataFrame(index=["g2", "g3"]), + obs=pd.DataFrame(index=["cell1", "cell2"]), + ) + output = adapt_vars_like(source, target, fill_value=-1) + assert output.X is None + assert list(output.var_names) == list(source.var_names) From ac4c067158ecea8b15ec351dee7770197f528df1 Mon Sep 17 00:00:00 2001 From: amalia-k510 Date: Mon, 2 Jun 2025 17:05:53 -0700 Subject: [PATCH 17/17] testing all aspects --- tests/test_utils.py | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9bffc76ae..0c9bba6ad 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -123,3 +123,60 @@ def test_adapt_vars_target_X_none(): output = adapt_vars_like(source, target, fill_value=-1) assert output.X is None assert list(output.var_names) == list(source.var_names) + + +def test_adapt_vars_all_objects(): + source = ad.AnnData( + X=np.ones((2, 3)), + var=gen_typed_df(3, index=pd.Index(["a", "b", "c"])), + ) + + target = ad.AnnData( + X=np.array([[1, 3], [2, 4]]), + var=gen_typed_df(2, index=pd.Index(["a", "c"])), + obs=pd.DataFrame(index=["cell1", "cell2"]), + varm={"varm_key": np.array([[10, 11], [30, 31]])}, + varp={"varp_key": np.array([[1, 2], [3, 4]])}, + obsp={"obsp_key": np.array([[5, 6], [7, 8]])}, + layers={"layer1": np.array([[1000, 3000], [1001, 3001]])}, + ) + + output = adapt_vars_like(source, target, fill_value=-1) + + expected_X = np.array( + [ + [1, -1, 3], + [2, -1, 4], + ] + ) + np.testing.assert_array_equal(output.X, expected_X) + assert list(output.var_names) == ["a", "b", "c"] + + expected_layer = np.array( + [ + [1000, -1, 3000], + [1001, -1, 3001], + ] + ) + np.testing.assert_array_equal(output.layers["layer1"], expected_layer) + + expected_varm = np.array( + [ + [10, 11], + [-1, -1], + [30, 31], + ] + ) + np.testing.assert_array_equal(output.varm["varm_key"], expected_varm) + + expected_varp = np.array( + [ + [1, -1, 2], + [-1, -1, -1], + [3, -1, 4], + ] + ) + np.testing.assert_array_equal(output.varp["varp_key"], expected_varp) + + expected_obsp = np.array([[5, 6], [7, 8]]) + np.testing.assert_array_equal(output.obsp["obsp_key"], expected_obsp)