forked from saezlab/decoupler-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmethod_mlm.py
133 lines (104 loc) · 4.31 KB
/
method_mlm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
Method MLM.
Code to run the Multivariate Linear Model (MLM) method.
"""
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
from scipy import stats
from tqdm.auto import tqdm
import numba as nb
@nb.njit(nb.f4[:, :](nb.f4[:, :], nb.f4[:, :], nb.f4[:, :], nb.i8), parallel=True, cache=True)
def fit_mlm(X, y, inv, df):
X = np.ascontiguousarray(X)
n_samples = y.shape[1]
n_fsets = X.shape[1]
coef, sse, _, _ = np.linalg.lstsq(X, y)
if len(sse) == 0:
raise ValueError("""Couldn\'t fit a multivariate linear model. This can happen because there are more sources
(covariates) than unique targets (samples), or because the network\'s matrix rank is smaller than the number of
sources.""")
sse = sse / df
se = np.zeros((n_samples, n_fsets), dtype=nb.f4)
for i in nb.prange(n_samples):
se[i] = np.sqrt(np.diag(sse[i] * inv))
t = coef.T/se
return t.astype(nb.f4)
def mlm(mat, net, batch_size=10000, verbose=False):
# Get dims
n_samples = mat.shape[0]
n_features, n_fsets = net.shape
# Add intercept to network
net = np.column_stack((np.ones((n_features, ), dtype=np.float32), net))
# Compute inv and df for lm
inv = np.linalg.inv(np.dot(net.T, net))
df = n_features - n_fsets - 1
if isinstance(mat, csr_matrix):
# Init empty acts
n_batches = int(np.ceil(n_samples / batch_size))
es = np.zeros((n_samples, n_fsets), dtype=np.float32)
for i in tqdm(range(n_batches), disable=not verbose):
# Subset batch
srt, end = i * batch_size, i * batch_size + batch_size
y = mat[srt:end].toarray().T
# Compute MLM for batch
es[srt:end] = fit_mlm(net, y, inv, df)[:, 1:]
else:
# Compute MLM for all
es = fit_mlm(net, mat.T, inv, df)[:, 1:]
# Get p-values
pvals = 2 * (1 - stats.t.cdf(np.abs(es), df))
return es, pvals
def run_mlm(mat, net, source='source', target='target', weight='weight', batch_size=10000,
min_n=5, verbose=False, use_raw=True):
"""
Multivariate Linear Model (MLM).
MLM fits a multivariate linear model for each sample, where the observed molecular readouts in `mat` are the response
variable and the regulator weights in `net` are the covariates. Target features with no associated weight are set to
zero. The obtained t-values from the fitted model are the activities (`mlm_estimate`) of the regulators in `net`.
Parameters
----------
mat : list, DataFrame or AnnData
List of [features, matrix], dataframe (samples x features) or an AnnData instance.
net : DataFrame
Network in long format.
source : str
Column name in net with source nodes.
target : str
Column name in net with target nodes.
weight : str
Column name in net with weights.
batch_size : int
Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster.
min_n : int
Minimum of targets per source. If less, sources are removed.
verbose : bool
Whether to show progress.
use_raw : bool
Use raw attribute of mat if present.
Returns
-------
estimate : DataFrame
MLM scores. Stored in `.obsm['mlm_estimate']` if `mat` is AnnData.
pvals : DataFrame
Obtained p-values. Stored in `.obsm['mlm_pvals']` if `mat` is AnnData.
"""
# Extract sparse matrix and array of genes
m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)
# Transform net
net = rename_net(net, source=source, target=target, weight=weight)
net = filt_min_n(c, net, min_n=min_n)
sources, targets, net = get_net_mat(net)
# Match arrays
net = match(c, targets, net)
if verbose:
print('Running mlm on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))
# Run MLM
estimate, pvals = mlm(m, net, batch_size=batch_size, verbose=verbose)
# Transform to df
estimate = pd.DataFrame(estimate, index=r, columns=sources)
estimate.name = 'mlm_estimate'
pvals = pd.DataFrame(pvals, index=r, columns=sources)
pvals.name = 'mlm_pvals'
return return_data(mat=mat, results=(estimate, pvals))