-
Notifications
You must be signed in to change notification settings - Fork 742
Expand file tree
/
Copy pathtest_preprocessing_distributed.py
More file actions
140 lines (107 loc) · 4.67 KB
/
test_preprocessing_distributed.py
File metadata and controls
140 lines (107 loc) · 4.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from __future__ import annotations
import warnings
from pathlib import Path
from typing import TYPE_CHECKING
import numpy.testing as npt
import pytest
from anndata import OldFormatWarning, read_zarr
from scanpy._compat import DaskArray
from scanpy.preprocessing import (
filter_cells,
filter_genes,
log1p,
normalize_per_cell,
normalize_total,
)
from scanpy.preprocessing._distributed import materialize_as_ndarray
from testing.scanpy._pytest.marks import needs
if TYPE_CHECKING:
from anndata import AnnData
# Directory holding the on-disk test data, resolved relative to this module.
HERE = Path(__file__).parent / "_data/"
# Zarr store that both fixtures in this module read from.
input_file = HERE / "10x-10k-subset.zarr"
# Module-level pytest marks, applied to every test in this file.
# NOTE(review): presumably `needs.zarr` skips the module when zarr is
# unavailable - confirm in testing.scanpy._pytest.marks.
pytestmark = [needs.zarr]
@pytest.fixture
def adata() -> AnnData:
    """Provide the reference dataset read from ``input_file``.

    The store is read with ``read_zarr`` while ``OldFormatWarning`` and the
    "Variable names are not unique" ``UserWarning`` are silenced. Variable
    names are then made unique and ``X`` is replaced by a full slice of
    itself.
    """
    with warnings.catch_warnings():
        # Silence the warnings expected when loading this particular store.
        warnings.filterwarnings(action="ignore", category=OldFormatWarning)
        warnings.filterwarnings(
            action="ignore",
            message=r"Variable names are not unique",
            category=UserWarning,
        )
        reference = read_zarr(input_file)
    reference.var_names_make_unique()
    # Full slice of X; the original comment notes this converts it to a
    # numpy array.
    reference.X = reference.X[:]
    return reference
@pytest.fixture
def adata_dist() -> AnnData:
    """Provide the same dataset as ``adata`` but with ``X`` built by dask.

    Everything except ``X`` comes from a plain ``read_zarr`` call; ``X`` is
    then replaced with the result of ``dask.array.from_zarr`` pointed at the
    ``X`` entry of the same store.
    """
    # Imported locally, as in the original, so dask is only needed when the
    # fixture is actually requested.
    import dask.array as da

    # A regular AnnData for now; X is swapped out just before returning.
    with warnings.catch_warnings():
        warnings.filterwarnings(action="ignore", category=OldFormatWarning)
        warnings.filterwarnings(
            action="ignore",
            message=r"Variable names are not unique",
            category=UserWarning,
        )
        distributed = read_zarr(input_file)
    distributed.var_names_make_unique()
    distributed.X = da.from_zarr(f"{input_file}/X")
    return distributed
def test_log1p(adata: AnnData, adata_dist: AnnData):
    """Check that ``log1p`` on the dask-backed data matches the reference."""
    # Dask-backed run: X must still be a DaskArray after the call.
    log1p(adata_dist)
    assert isinstance(adata_dist.X, DaskArray)
    dist_values = materialize_as_ndarray(adata_dist.X)

    # Reference run on the fixture whose X was sliced in full.
    log1p(adata)

    assert dist_values.shape == adata.shape
    npt.assert_allclose(dist_values, adata.X)
@pytest.mark.filterwarnings("ignore:.*sc.pp.normalize_total:FutureWarning")
def test_normalize_per_cell(
    request: pytest.FixtureRequest, adata: AnnData, adata_dist: AnnData
):
    """Compare ``normalize_per_cell`` on dask-backed and reference data.

    When ``adata_dist.X`` is a ``DaskArray`` the test is marked as an
    expected failure before anything else runs.
    """
    if isinstance(adata_dist.X, DaskArray):
        expected_failure = pytest.mark.xfail(
            reason="normalize_per_cell deprecated and broken for Dask"
        )
        request.applymarker(expected_failure)

    normalize_per_cell(adata_dist)
    assert isinstance(adata_dist.X, DaskArray)
    dist_values = materialize_as_ndarray(adata_dist.X)

    normalize_per_cell(adata)

    assert dist_values.shape == adata.shape
    npt.assert_allclose(dist_values, adata.X)
@pytest.mark.filterwarnings("ignore:Some cells have zero counts:UserWarning")
def test_normalize_total(adata: AnnData, adata_dist: AnnData) -> None:
    """Check that ``normalize_total`` agrees between dask and reference data."""
    # Dask-backed run: X must still be a DaskArray after the call.
    normalize_total(adata_dist)
    assert isinstance(adata_dist.X, DaskArray)
    dist_values = materialize_as_ndarray(adata_dist.X)

    # Reference run.
    normalize_total(adata)

    assert dist_values.shape == adata.shape
    npt.assert_allclose(dist_values, adata.X)
def test_filter_cells_array(adata: AnnData, adata_dist: AnnData):
    """Check ``filter_cells`` when it is handed the raw ``X`` arrays.

    Both returned values must be ``DaskArray`` instances for the dask input
    and must match the values returned for the reference input.
    """
    subset_dask, number_dask = filter_cells(adata_dist.X, min_genes=3)
    assert isinstance(subset_dask, DaskArray)
    assert isinstance(number_dask, DaskArray)

    subset_ref, number_ref = filter_cells(adata.X, min_genes=3)

    subset_values = materialize_as_ndarray(subset_dask)
    npt.assert_allclose(subset_values, subset_ref)
    number_values = materialize_as_ndarray(number_dask)
    npt.assert_allclose(number_values, number_ref)
def test_filter_cells(adata: AnnData, adata_dist: AnnData):
    """Check ``filter_cells`` applied directly to the AnnData objects.

    Compares the resulting shape, the ``n_genes`` column of ``obs`` and the
    contents of ``X`` between the dask-backed and the reference object.
    """
    filter_cells(adata_dist, min_genes=3)
    assert isinstance(adata_dist.X, DaskArray)
    dist_values = materialize_as_ndarray(adata_dist.X)

    filter_cells(adata, min_genes=3)

    assert dist_values.shape == adata.shape
    n_genes_dist = adata_dist.obs["n_genes"]
    n_genes_ref = adata.obs["n_genes"]
    npt.assert_array_equal(n_genes_dist, n_genes_ref)
    npt.assert_allclose(dist_values, adata.X)
def test_filter_genes_array(adata: AnnData, adata_dist: AnnData):
    """Check ``filter_genes`` when it is handed the raw ``X`` arrays.

    Both returned values must be ``DaskArray`` instances for the dask input
    and must match the values returned for the reference input.
    """
    subset_dask, number_dask = filter_genes(adata_dist.X, min_cells=2)
    assert isinstance(subset_dask, DaskArray)
    assert isinstance(number_dask, DaskArray)

    subset_ref, number_ref = filter_genes(adata.X, min_cells=2)

    subset_values = materialize_as_ndarray(subset_dask)
    npt.assert_allclose(subset_values, subset_ref)
    number_values = materialize_as_ndarray(number_dask)
    npt.assert_allclose(number_values, number_ref)
def test_filter_genes(adata: AnnData, adata_dist: AnnData):
    """Check ``filter_genes`` applied directly to the AnnData objects."""
    # Dask-backed run: X must still be a DaskArray after filtering.
    filter_genes(adata_dist, min_cells=2)
    assert isinstance(adata_dist.X, DaskArray)
    dist_values = materialize_as_ndarray(adata_dist.X)

    # Reference run.
    filter_genes(adata, min_cells=2)

    assert dist_values.shape == adata.shape
    npt.assert_allclose(dist_values, adata.X)
def test_write_zarr(adata: AnnData, adata_dist: AnnData, tmp_path: Path) -> None:
    """Round-trip dask-backed, ``log1p``-transformed data through a Zarr store.

    The dask-backed object is transformed, written under ``tmp_path`` and
    read back; its ``X`` must match the reference object after the same
    transformation.
    """
    store = tmp_path / "test.zarr"

    log1p(adata_dist)
    assert isinstance(adata_dist.X, DaskArray)
    adata_dist.write_zarr(store)
    reloaded = read_zarr(store)

    log1p(adata)
    npt.assert_allclose(reloaded.X, adata.X)