forked from saezlab/decoupler-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmethod_wsum.py
175 lines (140 loc) · 5.84 KB
/
method_wsum.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""
Method WSUM.
Code to run the Weighted sum (WSUM) method.
"""
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
from .method_gsea import std
from tqdm.auto import tqdm
import numba as nb
@nb.njit(nb.types.UniTuple(nb.f4[:, :], 3)(nb.f4[:, :], nb.f4[:, :], nb.f4[:, :], nb.i8[:], nb.i8, nb.i8), cache=True)
def run_perm(estimate, mat, net, idxs, times, seed):
mat = np.ascontiguousarray(mat)
np.random.seed(seed)
# Init null distirbution
null_dst = np.zeros((mat.shape[0], net.shape[1], times), dtype=nb.f4)
pvals = np.zeros((mat.shape[0], net.shape[1]), dtype=nb.f4)
# Permute
for i in nb.prange(times):
np.random.shuffle(idxs)
null_dst[:, :, i] = mat.dot(net[idxs])
pvals += np.abs(null_dst[:, :, i]) > np.abs(estimate)
# Compute empirical p-value
pvals = np.where(pvals == 0.0, 1.0, pvals).astype(nb.f4)
pvals = np.where(pvals == times, times-1, pvals).astype(nb.f4)
pvals = pvals / times
pvals = np.where(pvals >= 0.5, 1-(pvals), pvals)
pvals = pvals * 2
# Compute z-score
norm = np.zeros((mat.shape[0], net.shape[1]), dtype=nb.f4)
for i in nb.prange(mat.shape[0]):
for j in range(net.shape[1]):
e = estimate[i, j]
d = std(null_dst[i, j, :], 1)
if d != 0.:
n = (e - np.mean(null_dst[i, j, :])) / d
else:
if e != 0.:
n = np.inf
else:
n = 0.
norm[i, j] = n
# Compute corr score
corr = (estimate * -np.log10(pvals)).astype(nb.f4)
return norm, corr, pvals
def wsum(mat, net, times, batch_size, seed, verbose):
# Get dims
n_samples = mat.shape[0]
n_features, n_fsets = net.shape
# Init empty acts
estimate = np.zeros((n_samples, n_fsets), dtype=np.float32)
if times > 1:
norm = np.zeros((n_samples, n_fsets), dtype=np.float32)
corr = np.zeros((n_samples, n_fsets), dtype=np.float32)
pvals = np.zeros((n_samples, n_fsets), dtype=np.float32)
idxs = np.arange(n_features, dtype=np.int64)
else:
norm, corr, pvals = None, None, None
if isinstance(mat, csr_matrix):
n_batches = int(np.ceil(n_samples / batch_size))
for i in tqdm(range(n_batches), disable=not verbose):
# Subset batch
srt, end = i * batch_size, i * batch_size + batch_size
tmp = mat[srt:end].toarray()
# Run WSUM
estimate[srt:end] = tmp.dot(net)
if times > 1:
norm[srt:end], corr[srt:end], pvals[srt:end] = run_perm(estimate[srt:end], tmp, net, idxs, times, seed)
else:
estimate = mat.dot(net)
if times > 1:
norm, corr, pvals = run_perm(estimate, mat, net, idxs, times, seed)
return estimate, norm, corr, pvals
def run_wsum(mat, net, source='source', target='target', weight='weight', times=1000, batch_size=10000, min_n=5, seed=42,
verbose=False, use_raw=True):
"""
Weighted sum (WSUM).
WSUM infers regulator activities by first multiplying each target feature by its associated weight which then are summed
to an enrichment score (`wsum_estimate`). Furthermore, permutations of random target features can be performed to obtain a
null distribution that can be used to compute a z-score (`wsum_norm`), or a corrected estimate (`wsum_corr`) by multiplying
`wsum_estimate` by the minus log10 of the obtained empirical p-value.
Parameters
----------
mat : list, DataFrame or AnnData
List of [features, matrix], dataframe (samples x features) or an AnnData instance.
net : DataFrame
Network in long format.
source : str
Column name in net with source nodes.
target : str
Column name in net with target nodes.
weight : str
Column name in net with weights.
times : int
How many random permutations to do.
batch_size : int
Size of the batches to use. Increasing this will consume more memmory but it will run faster.
min_n : int
Minimum of targets per source. If less, sources are removed.
seed : int
Random seed to use.
verbose : bool
Whether to show progress.
use_raw : bool
Use raw attribute of mat if present.
Returns
-------
estimate : DataFrame
WSUM scores. Stored in `.obsm['wsum_estimate']` if `mat` is AnnData.
norm : DataFrame
Normalized WSUM scores. Stored in `.obsm['wsum_norm']` if `mat` is AnnData.
corr : DataFrame
Corrected WSUM scores. Stored in `.obsm['wsum_corr']` if `mat` is AnnData.
pvals : DataFrame
Obtained p-values. Stored in `.obsm['wsum_pvals']` if `mat` is AnnData.
"""
# Extract sparse matrix and array of genes
m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)
# Transform net
net = rename_net(net, source=source, target=target, weight=weight)
net = filt_min_n(c, net, min_n=min_n)
sources, targets, net = get_net_mat(net)
# Match arrays
net = match(c, targets, net)
if verbose:
print('Running wsum on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))
# Run WSUM
estimate, norm, corr, pvals = wsum(m, net, times, batch_size, seed, verbose)
# Transform to df
estimate = pd.DataFrame(estimate, index=r, columns=sources)
estimate.name = 'wsum_estimate'
if pvals is not None:
norm = pd.DataFrame(norm, index=r, columns=sources)
norm.name = 'wsum_norm'
corr = pd.DataFrame(corr, index=r, columns=sources)
corr.name = 'wsum_corr'
pvals = pd.DataFrame(pvals, index=r, columns=sources)
pvals.name = 'wsum_pvals'
return return_data(mat=mat, results=(estimate, norm, corr, pvals))