scanpy/tests/test_combat.py at 99a6e2367b2ecf577a1aabe6cdf19b8218804f3d · LiudengZhang/scanpy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from __future__ import annotations

import numpy as np
import pandas as pd
import pytest
from anndata.tests.helpers import assert_equal
from sklearn.metrics import silhouette_score

import scanpy as sc
from scanpy.preprocessing._combat import _design_matrix, _standardize_data


def test_norm():
    """Sanity-check that ``_standardize_data`` centres the data.

    The blobs dataset is reshaped into a variables-by-observations frame,
    standardized with the "blobs" annotation used as the batch, and the
    row-wise means of the result are expected to be numerically zero.
    """
    adata = sc.datasets.blobs()
    batch_key = "blobs"

    # rows are variables, columns are observations
    expression = pd.DataFrame(
        data=adata.X.T,
        index=adata.var_names,
        columns=adata.obs_names,
    )

    # single-column model frame holding the batch annotation
    batch_labels = pd.Series(adata.obs[batch_key])
    model = pd.DataFrame({"batch": batch_labels})

    # only the standardized data is inspected; the other outputs are unused
    standardized, _design, _var_pooled, _stand_mean = _standardize_data(
        model, expression, "batch"
    )

    row_means = standardized.mean(axis=1)
    expected = np.zeros(standardized.shape[0])
    assert np.allclose(row_means, expected)


def test_covariates():
    """Check that combat accepts covariates and the design matrix has the expected width.

    Combat is run once without covariates and then with three randomly
    generated covariate columns (both as a returned array and in place).
    The corrected matrices must share a shape, and the design matrix built
    from the same columns must have the expected number of columns.
    """
    adata = sc.datasets.blobs()
    key = "blobs"
    covariates = ["cat1", "cat2", "num1"]

    # baseline correction, computed before any covariate columns exist
    without_cov = sc.pp.combat(adata, key=key, inplace=False)

    # the global seed and the draw order are kept fixed so the covariates are reproducible
    np.random.seed(0)
    n_obs = adata.n_obs
    adata.obs["cat1"] = np.random.binomial(3, 0.5, size=n_obs)
    adata.obs["cat2"] = np.random.binomial(2, 0.1, size=n_obs)
    adata.obs["num1"] = np.random.normal(size=n_obs)

    with_cov = sc.pp.combat(adata, key=key, covariates=covariates, inplace=False)
    # also exercise the in-place code path with the same arguments
    sc.pp.combat(adata, key=key, covariates=covariates, inplace=True)

    assert without_cov.shape == with_cov.shape

    model = adata.obs[[*covariates, key]]
    batch_cats = adata.obs[key].cat.categories
    design = _design_matrix(model, key, batch_cats)

    # NOTE(review): expected width is kept exactly as originally written;
    # how it breaks down into batch vs. covariate columns depends on
    # `_design_matrix`, which is not visible here.
    expected_n_columns = 4 + len(batch_cats) - 1
    assert len(design.columns) == expected_n_columns


def test_combat_obs_names():
    """Check that combat behaves the same with non-unique observation names.

    Test for fix to #1170. Two copies of the same data are corrected: one
    keeps its duplicated observation names, the other has them made unique
    first. The corrected matrices must match, and once the names of the
    first object are made unique as well, the two objects must be equal.
    """
    # A locally seeded generator keeps the input data reproducible, so a
    # failure can be replayed, without touching NumPy's global random state.
    rng = np.random.default_rng(0)

    x = rng.random((200, 100))
    obs = pd.DataFrame(
        {"batch": pd.Categorical(rng.integers(0, 2, 200))},
        index=np.repeat(np.arange(100), 2).astype(str),  # Non-unique index
    )
    # Both constructing and copying the object are expected to warn about
    # the duplicated observation names.
    with pytest.warns(UserWarning, match="Observation names are not unique"):
        a = sc.AnnData(x, obs)
    with pytest.warns(UserWarning, match="Observation names are not unique"):
        b = a.copy()
    b.obs_names_make_unique()

    sc.pp.combat(a, "batch")
    sc.pp.combat(b, "batch")

    # the correction itself must not depend on the observation names
    assert_equal(a.X, b.X)

    # after aligning the names, the whole objects must agree
    a.obs_names_make_unique()
    assert_equal(a, b)


def test_combat_single_cell_batch():
    """Combat must reject a batch annotation containing a one-cell batch.

    Regression test for https://github.com/scverse/scanpy/issues/1175
    """
    adata = sc.datasets.blobs()

    # every cell belongs to "other" except the first, which forms its own batch
    labels = ["other"] * adata.n_obs
    labels[0] = "single"
    adata.obs["batch"] = pd.Categorical(labels)

    with pytest.raises(ValueError, match="fewer than 2 cells"):
        sc.pp.combat(adata, key="batch")


def test_silhouette():
    """Check whether combat can align data drawn from several gaussians.

    After correcting the blobs dataset with the "blobs" annotation as the
    batch, the silhouette coefficient of that annotation in the first two
    PCA components is expected to be below 0.1.
    """
    adata = sc.datasets.blobs()

    # batch-correct in place, then embed with PCA
    sc.pp.combat(adata, "blobs")
    sc.pp.pca(adata)

    # score the labels on the first two principal components only
    embedding = adata.obsm["X_pca"][:, :2]
    labels = adata.obs["blobs"].values
    score = silhouette_score(embedding, labels)

    assert score < 0.1