scanpy/tests/test_normalization.py at 051836482f3ae9c703112a6aa69bae40bd26fff7 · scverse/scanpy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
from __future__ import annotations

from contextlib import nullcontext
from functools import partial
from typing import TYPE_CHECKING

import numpy as np
import pytest
from anndata import AnnData
from anndata.tests.helpers import assert_equal
from fast_array_utils import conv, stats
from scipy import sparse

import scanpy as sc
from scanpy.preprocessing._normalization import _compute_nnz_median
from testing.scanpy._helpers import (
    _check_check_values_warnings,
    check_rep_mutation,
    check_rep_results,
)

# TODO: Add support for sparse-in-dask
from testing.scanpy._pytest.params import ARRAY_TYPES, ARRAY_TYPES_DENSE

if TYPE_CHECKING:
    from collections.abc import Callable
    from typing import Any

# Densify an array through `conv.to_dense`, always passing `to_cpu_memory=True`
# (presumably so device-backed arrays come back as host NumPy arrays —
# TODO confirm against the fast_array_utils documentation).
to_ndarray = partial(conv.to_dense, to_cpu_memory=True)

# Toy matrix with 3 rows and 2 columns; the row totals are 1, 3 and 11.
X_total = np.array([[1, 0], [3, 0], [5, 6]])
# Same values as `X_total` plus a third column of ones; row totals are 2, 4 and 12.
X_frac = np.array([[1, 0, 1], [3, 0, 1], [5, 6, 1]])


@pytest.mark.parametrize("array_type", ARRAY_TYPES)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
@pytest.mark.parametrize("target_sum", [None, 1.0], ids=["no_target_sum", "target_sum"])
@pytest.mark.parametrize(
    "exclude_highly_expressed", [True, False], ids=["excl_hi", "no_excl_hi"]
)
def test_normalize_matrix_types(
    array_type, dtype, target_sum, exclude_highly_expressed
):
    """Check that `normalize_total` agrees between the raw matrix and `array_type`.

    The same raw counts are normalized twice: once as stored in `.raw.X` and once
    after conversion through `array_type`. Both results are densified and must
    match within `rtol=1e-5` / `atol=1e-5`.
    """
    normalize_kwargs = dict(
        target_sum=target_sum, exclude_highly_expressed=exclude_highly_expressed
    )

    reference = sc.datasets.pbmc68k_reduced()
    reference.X = reference.raw.X.astype(dtype)
    converted = reference.copy()
    converted.X = array_type(converted.raw.X).astype(dtype)

    # Reference first, converted second.
    for adata in (reference, converted):
        sc.pp.normalize_total(adata, **normalize_kwargs)

    reference.X = conv.to_dense(reference.X)
    # `to_ndarray` is `conv.to_dense` with `to_cpu_memory=True` pre-bound.
    converted.X = to_ndarray(converted.X)
    np.testing.assert_allclose(converted.X, reference.X, rtol=1e-5, atol=1e-5)


@pytest.mark.parametrize("array_type", ARRAY_TYPES)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_normalize_total(array_type, dtype):
    """Check the row sums produced by `normalize_total` on the toy matrices.

    With `X_total`, the default call must leave every row summing to 3 and
    `target_sum=1` must leave every row summing to 1. With `X_frac` and
    `exclude_highly_expressed=True, max_fraction=0.7`, the last two columns of
    every row must sum to 1.
    """

    def row_sums(matrix):
        # Sum over axis 1 and densify so the result can be fed to `np.allclose`.
        return to_ndarray(stats.sum(matrix, axis=1))

    totals_adata = AnnData(array_type(X_total).astype(dtype))

    sc.pp.normalize_total(totals_adata, key_added="n_counts")
    assert np.allclose(row_sums(totals_adata.X), [3.0, 3.0, 3.0])

    sc.pp.normalize_total(totals_adata, target_sum=1, key_added="n_counts2")
    assert np.allclose(row_sums(totals_adata.X), [1.0, 1.0, 1.0])

    frac_adata = AnnData(array_type(X_frac).astype(dtype))
    sc.pp.normalize_total(
        frac_adata, exclude_highly_expressed=True, max_fraction=0.7
    )
    assert np.allclose(row_sums(frac_adata.X[:, 1:3]), [1.0, 1.0, 1.0])


@pytest.mark.filterwarnings("ignore:Some cells have zero counts:UserWarning")
@pytest.mark.parametrize("array_type", ARRAY_TYPES)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_normalize_total_rep(array_type, dtype):
    """Run the shared representation checks against `normalize_total`.

    A random 100 x 50 CSR matrix (density 0.2) is converted through `array_type`
    and handed to `check_rep_mutation` and then `check_rep_results`.
    """
    random_counts = sparse.random(100, 50, format="csr", density=0.2, dtype=dtype)
    x = array_type(random_counts)
    for run_check in (check_rep_mutation, check_rep_results):
        run_check(sc.pp.normalize_total, x)


@pytest.mark.parametrize("array_type", ARRAY_TYPES)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_normalize_total_view(array_type, dtype):
    """Check `normalize_total` on a view of an AnnData object.

    Normalizing the view must emit a "Received a view" `UserWarning`, leave the
    object no longer flagged as a view, and give a result equal to normalizing
    the parent object directly.
    """
    parent = AnnData(array_type(X_total).astype(dtype))
    view = parent[:, :]  # full slice: same content, but a view of `parent`

    with pytest.warns(UserWarning, match=r"Received a view"):
        sc.pp.normalize_total(view)
    sc.pp.normalize_total(parent)

    assert not view.is_view
    assert_equal(parent, view)


def test_normalize_pearson_residuals_warnings(pbmc3k_parametrized):
    """Check the non-integer-data warning of `normalize_pearson_residuals`.

    One nonzero entry of a copy of the data is overwritten with 0.5, and the
    shared `_check_check_values_warnings` helper is run with the expected
    warning text. Skipped for integer dtypes, which cannot hold 0.5.
    """
    adata = pbmc3k_parametrized()

    if np.issubdtype(adata.X.dtype, np.integer):
        pytest.skip("Can’t store non-integral data with int dtype")

    # Turn the first nonzero entry into a non-integer value.
    noninteger = adata.copy()
    rows, cols = np.nonzero(noninteger.X)
    noninteger.X[rows[0], cols[0]] = 0.5

    expected = "`normalize_pearson_residuals()` expects raw count data, but non-integers were found."
    _check_check_values_warnings(
        function=sc.experimental.pp.normalize_pearson_residuals,
        adata=noninteger,
        expected_warning=expected,
    )


@pytest.mark.parametrize(
    ("params", "match"),
    [
        pytest.param(
            {"theta": 0}, r"Pearson residuals require theta > 0", id="theta=0"
        ),
        pytest.param(
            {"theta": -1}, r"Pearson residuals require theta > 0", id="theta=-1"
        ),
        pytest.param(
            {"clip": -1},
            r"Pearson residuals require `clip>=0` or `clip=None`.",
            id="clip=-1",
        ),
    ],
)
def test_normalize_pearson_residuals_errors(pbmc3k_parametrized, params, match):
    """Check that invalid `theta` / `clip` values raise a matching `ValueError`."""
    adata = pbmc3k_parametrized()
    normalize = sc.experimental.pp.normalize_pearson_residuals

    with pytest.raises(ValueError, match=match):
        normalize(adata, **params)


@pytest.mark.parametrize(
    "sparsity_func",
    [np.array, sparse.csr_matrix],  # noqa: TID251
    ids=lambda x: x.__name__,
)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
@pytest.mark.parametrize("theta", [0.01, 1, 100, np.inf])
@pytest.mark.parametrize("clip", [None, 1, np.inf])
def test_normalize_pearson_residuals_values(sparsity_func, dtype, theta, clip):
    """Compare `normalize_pearson_residuals` output with hand-computed residuals.

    The `inplace=False` result must equal the `inplace=True` result, the
    expected `.uns` entries must exist, and the values must either match the
    reference residuals (`clip=np.inf`) or stay within the clipping bound.
    """
    # Toy counts and the expected means: outer product of row totals and
    # column fractions.
    counts = np.array([[3, 6], [2, 4], [1, 0]])
    row_totals = counts.sum(axis=1)
    col_fractions = counts.sum(axis=0) / counts.sum()
    mu = np.outer(row_totals, col_fractions)

    # Reference residuals: Poisson variance for infinite theta, NB otherwise.
    variance = mu if np.isinf(theta) else mu + mu**2 / theta
    residuals_reference = (counts - mu) / np.sqrt(variance)

    # Run the function under test, first as a copy and then in place.
    normalize = sc.experimental.pp.normalize_pearson_residuals
    adata = AnnData(sparsity_func(counts).astype(dtype))
    output_x = normalize(adata, theta=theta, clip=clip, inplace=False)["X"]
    normalize(adata, theta=theta, clip=clip, inplace=True)

    # The in-place run must have recorded its settings in `.uns`.
    assert "pearson_residuals_normalization" in adata.uns.keys()
    recorded = adata.uns["pearson_residuals_normalization"]
    assert {"theta", "clip", "computed_on"}.issubset(recorded.keys())
    # Copy and in-place results must be identical.
    np.testing.assert_array_equal(adata.X, output_x)

    if clip is not None and np.isinf(clip):
        # No clipping: compare to raw residuals.
        assert np.allclose(output_x, residuals_reference)
    else:
        # Default clipping uses sqrt(n_obs); otherwise the custom threshold.
        if clip is None:
            bound = np.sqrt(adata.shape[0]).astype(np.float32)
        else:
            bound = clip
        assert np.max(output_x) <= bound
        assert np.min(output_x) >= -bound


def _check_pearson_pca_fields(ad, n_cells, n_comps):
    """Assert that `ad` carries the fields written by the Pearson-residual PCA.

    Checks, in order: the `pearson_residuals_normalization` and `pca` keys in
    `ad.uns`, the `X_pca` key in `ad.obsm`, the `PCs` key in `ad.varm`, and that
    `ad.obsm["X_pca"]` has shape `(n_cells, n_comps)`.

    Raises
    ------
    AssertionError
        On the first check that fails.
    """
    required_uns = {"pearson_residuals_normalization", "pca"}
    assert required_uns.issubset(ad.uns.keys()), (
        "Missing `.uns` keys. Expected `['pearson_residuals_normalization', 'pca']`, "
        f"but only {list(ad.uns.keys())} were found"
    )

    obsm, varm = ad.obsm, ad.varm
    assert "X_pca" in obsm, (
        f"Missing `obsm` key `'X_pca'`, only {list(obsm.keys())} were found"
    )
    assert "PCs" in varm, (
        f"Missing `varm` key `'PCs'`, only {list(varm.keys())} were found"
    )

    expected_shape = (n_cells, n_comps)
    assert obsm["X_pca"].shape == expected_shape, "Wrong shape of PCA output in `X_pca`"


@pytest.mark.parametrize("n_hvgs", [100, 200])
@pytest.mark.parametrize("n_comps", [30, 50])
@pytest.mark.parametrize(
    ("do_hvg", "params", "n_var_copy_name"),
    [
        pytest.param(False, dict(), "n_genes", id="no_hvg"),
        pytest.param(True, dict(), "n_hvgs", id="hvg_default"),
        pytest.param(
            True, dict(use_highly_variable=False), "n_genes", id="hvg_opt_out"
        ),
        pytest.param(False, dict(mask_var="test_mask"), "n_unmasked", id="mask"),
    ],
)
def test_normalize_pearson_residuals_pca(
    *,
    pbmc3k_parametrized_small: Callable[[], AnnData],
    n_hvgs: int,
    n_comps: int,
    do_hvg: bool,
    params: dict[str, Any],
    n_var_copy_name: str,  # number of variables in output if inplace=False
):
    """Check shapes and fields produced by `normalize_pearson_residuals_pca`.

    The function is run once with `inplace=False` (on a copy) and once with
    `inplace=True`. Both results must carry the PCA fields, the in-place object
    must keep its original shape, the copy must have `n_var_copy` variables, and
    both runs must give identical `X_pca`.

    `n_var_copy_name` names which quantity (`n_genes`, `n_hvgs` or `n_unmasked`)
    is the expected number of variables in the `inplace=False` output.
    """
    adata = pbmc3k_parametrized_small()
    n_cells, n_genes = adata.shape
    n_unmasked = n_genes - 5
    adata.var["test_mask"] = np.r_[
        np.repeat(True, n_unmasked), np.repeat(False, n_genes - n_unmasked)  # noqa: FBT003
    ]
    # Explicit lookup table instead of `locals()`: only the documented names
    # are accepted, and renaming a local cannot silently change the lookup.
    n_var_copy = {
        "n_genes": n_genes,
        "n_hvgs": n_hvgs,
        "n_unmasked": n_unmasked,
    }[n_var_copy_name]
    assert isinstance(n_var_copy, int | np.integer)

    if do_hvg:
        sc.experimental.pp.highly_variable_genes(
            adata, flavor="pearson_residuals", n_top_genes=n_hvgs
        )

    def expect_deprecation():
        # Build a fresh context manager per `with` block rather than
        # re-entering one shared instance.
        if "use_highly_variable" in params:
            return pytest.warns(
                FutureWarning, match=r"use_highly_variable.*deprecated"
            )
        return nullcontext()

    with expect_deprecation():  # inplace=False
        adata_pca = sc.experimental.pp.normalize_pearson_residuals_pca(
            adata.copy(), inplace=False, n_comps=n_comps, **params
        )
    with expect_deprecation():  # inplace=True modifies the input adata object
        sc.experimental.pp.normalize_pearson_residuals_pca(
            adata, inplace=True, n_comps=n_comps, **params
        )

    for ad, n_var_ret in (
        (adata_pca, n_var_copy),
        # in-place adatas should always retain their original shape
        (adata, n_genes),
    ):
        _check_pearson_pca_fields(ad, n_cells, n_comps)

        # check adata shape to see if all genes or only HVGs are in the returned adata
        assert ad.shape == (n_cells, n_var_ret)

        # check PC shapes to see whether or not HVGs were used for PCA
        assert ad.varm["PCs"].shape == (n_var_ret, n_comps)

    # Count the all-zero rows of `PCs` (one row per variable) in the in-place
    # result: there must be exactly as many as there are variables missing
    # from the `inplace=False` output (none when all variables were used).
    abs_loading_per_var = np.sum(np.abs(adata.varm["PCs"]), axis=1)
    n_zero_rows = np.count_nonzero(abs_loading_per_var == 0)
    assert n_zero_rows == (n_genes - n_var_copy)

    # compare PCA results between inplace / copied
    np.testing.assert_array_equal(adata.obsm["X_pca"], adata_pca.obsm["X_pca"])


@pytest.mark.parametrize("n_hvgs", [100, 200])
@pytest.mark.parametrize("n_comps", [30, 50])
def test_normalize_pearson_residuals_recipe(
    pbmc3k_parametrized_small: Callable[[], AnnData], n_hvgs: int, n_comps: int
) -> None:
    """Check the outputs of `recipe_pearson_residuals` for both `inplace` modes.

    With `inplace=False` the returned AnnData must contain only `n_hvgs`
    variables and the returned HVG table must cover all genes. With
    `inplace=True` the input keeps its shape and `PCs` has all-zero rows for
    the `n_genes - n_hvgs` variables that were not selected.
    """
    adata = pbmc3k_parametrized_small()
    n_cells, n_genes = adata.shape
    run_recipe = partial(
        sc.experimental.pp.recipe_pearson_residuals,
        n_comps=n_comps,
        n_top_genes=n_hvgs,
    )

    # --- inplace=False: returns the HVG-restricted AnnData plus the HVG table ---
    adata_pca, hvg = run_recipe(adata.copy(), inplace=False)

    _check_pearson_pca_fields(adata_pca, n_cells, n_comps)
    # Only the selected variables remain, in the object and in the loadings.
    assert adata_pca.shape == (n_cells, n_hvgs)
    assert adata_pca.varm["PCs"].shape == (n_hvgs, n_comps)

    # The HVG table has the expected columns and one row per original gene.
    expected_columns = {
        "means",
        "variances",
        "residual_variances",
        "highly_variable_rank",
        "highly_variable",
    }
    assert expected_columns.issubset(hvg.columns)
    assert hvg["highly_variable"].sum() == n_hvgs
    assert len(hvg) == n_genes

    # --- inplace=True: modifies the input AnnData object ---
    run_recipe(adata, inplace=True)

    _check_pearson_pca_fields(adata, n_cells, n_comps)
    # The input keeps its shape, so the loadings have one row per original gene.
    assert adata.shape == (n_cells, n_genes)
    assert adata.varm["PCs"].shape == (n_genes, n_comps)
    # The number of all-zero rows should equal the number of non-HVGs.
    abs_loading_per_var = np.abs(adata.varm["PCs"]).sum(axis=1)
    assert np.count_nonzero(abs_loading_per_var == 0) == n_genes - n_hvgs


@pytest.mark.parametrize("array_type", ARRAY_TYPES_DENSE)
@pytest.mark.parametrize("dtype", ["float32", "int64"])
def test_compute_nnz_median(array_type, dtype):
    """Check that `_compute_nnz_median` returns 5 for three zeros followed by 1..9."""
    values = [0, 0, 0, *range(1, 10)]
    data = array_type(np.array(values, dtype=dtype))
    np.testing.assert_allclose(_compute_nnz_median(data), 5)