forked from saezlab/decoupler-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmethod_mdt.py
119 lines (92 loc) · 3.75 KB
/
method_mdt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Method MDT.
Code to run the Multivariate Decision Tree (MDT) method.
"""
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
from tqdm.auto import tqdm
def check_if_skranger():
    """
    Import scikit-learn's ensemble module and return it.

    Despite the historical name of this helper, the package imported here is scikit-learn (`sklearn`), not skranger.
    The import is done lazily inside the function so that the dependency is only needed when this helper is called.

    Returns
    -------
    ensemble : module
        The imported `sklearn.ensemble` module.

    Raises
    ------
    ImportError
        If importing `sklearn.ensemble` fails for any reason. The original exception is attached as the cause.
    """
    try:
        from sklearn import ensemble
    except Exception as err:
        # Chain explicitly so the underlying import failure stays attached to the user-facing error
        raise ImportError('sklearn is not installed. Please install it with: pip install scikit-learn') from err
    return ensemble
def fit_rf(sr, net, sample, trees=100, min_leaf=5, n_jobs=-1, seed=42):
    """
    Fit a random forest regressor and return its feature importances.

    Parameters
    ----------
    sr : module
        Object exposing a `RandomForestRegressor` constructor (`mdt` passes the module returned by
        `check_if_skranger`).
    net : array-like
        Covariates, forwarded as the first argument of `fit`.
    sample : array-like
        Response values, forwarded as the second argument of `fit`.
    trees : int
        Forwarded as `n_estimators`.
    min_leaf : int
        Forwarded as `min_samples_leaf`.
    n_jobs : int
        Forwarded as `n_jobs`.
    seed : int
        Forwarded as `random_state`.

    Returns
    -------
    importances
        The `feature_importances_` attribute of the fitted regressor.
    """
    # Map this function's argument names onto the regressor's keyword names
    forest_kwargs = {
        'n_estimators': trees,
        'min_samples_leaf': min_leaf,
        'n_jobs': n_jobs,
        'random_state': seed,
    }
    forest = sr.RandomForestRegressor(**forest_kwargs)
    forest.fit(net, sample)
    return forest.feature_importances_
def mdt(mat, net, trees=100, min_leaf=5, n_jobs=4, seed=42, verbose=False):
    """
    Compute MDT activities by fitting one random forest per row of `mat`.

    For every row of `mat`, a random forest is fitted through `fit_rf` with `net` as covariates and that row as the
    response; the resulting feature importances become the corresponding row of the output.

    Parameters
    ----------
    mat : csr_matrix or array-like
        Data indexed by row. Rows of a `csr_matrix` are densified before fitting; anything else is indexed directly.
    net : array-like
        Covariates passed to every fit. Its second dimension sets the number of output columns.
    trees : int
        Number of trees in each forest.
    min_leaf : int
        Minimum number of samples required at a leaf node.
    n_jobs : int
        Number of jobs forwarded to the regressor.
    seed : int
        Random seed forwarded to the regressor.
    verbose : bool
        Whether to show a progress bar.

    Returns
    -------
    acts : ndarray
        Float32 array of shape (`mat.shape[0]`, `net.shape[1]`) holding the feature importances per row.
    """
    # Resolve the scikit-learn ensemble module (raises ImportError if it cannot be imported)
    ensemble = check_if_skranger()

    n_samples, n_sources = mat.shape[0], net.shape[1]
    acts = np.zeros((n_samples, n_sources), dtype=np.float32)

    # The container type does not change while iterating, so decide once how rows are read
    is_sparse = isinstance(mat, csr_matrix)

    for idx in tqdm(range(n_samples), disable=not verbose):
        row = mat[idx].toarray()[0] if is_sparse else mat[idx]
        acts[idx] = fit_rf(ensemble, net, row, trees=trees, min_leaf=min_leaf, n_jobs=n_jobs, seed=seed)

    return acts
def run_mdt(mat, net, source='source', target='target', weight='weight', trees=100, min_leaf=5, n_jobs=-1, min_n=5, seed=42,
            verbose=False, use_raw=True):
    """
    Multivariate Decision Tree (MDT).

    MDT fits a multivariate regression random forest for each sample, where the observed molecular readouts in `mat` are the
    response variable and the regulator weights in `net` are the covariates. Target features with no associated weight are set
    to zero. The obtained feature importances from the fitted model are the activities (`mdt_estimate`) of the regulators in
    `net`.

    Parameters
    ----------
    mat : list, DataFrame or AnnData
        List of [features, matrix], dataframe (samples x features) or an AnnData instance.
    net : DataFrame
        Network in long format.
    source : str
        Column name in net with source nodes.
    target : str
        Column name in net with target nodes.
    weight : str
        Column name in net with weights.
    trees : int
        Number of trees in the forest.
    min_leaf : int
        The minimum number of samples required to be at a leaf node.
    n_jobs : int
        Number of jobs to run in parallel.
    min_n : int
        Minimum of targets per source. If less, sources are removed.
    seed : int
        Random seed to use.
    verbose : bool
        Whether to show progress.
    use_raw : bool
        Use raw attribute of mat if present.

    Returns
    -------
    estimate : DataFrame
        MDT scores. Stored in `.obsm['mdt_estimate']` if `mat` is AnnData. This is the only result handed to
        `return_data`; no p-values are computed by this method.
    """
    # Extract the data matrix together with its row and column labels
    data, row_names, col_names = extract(mat, use_raw=use_raw, verbose=verbose)

    # Transform net: rename columns, filter sources by `min_n`, then build the network matrix
    long_net = rename_net(net, source=source, target=target, weight=weight)
    long_net = filt_min_n(col_names, long_net, min_n=min_n)
    sources, targets, net_mat = get_net_mat(long_net)

    # Match the network matrix against the column labels of the data
    net_mat = match(col_names, targets, net_mat)

    if verbose:
        message = 'Running mdt on mat with {0} samples and {1} targets for {2} sources.'
        print(message.format(data.shape[0], len(col_names), net_mat.shape[1]))

    # Run MDT
    scores = mdt(data, net_mat, trees=trees, min_leaf=min_leaf, n_jobs=n_jobs, seed=seed, verbose=verbose)

    # Wrap the scores in a labelled dataframe; the `name` attribute is set before passing it to `return_data`
    estimate = pd.DataFrame(scores, index=row_names, columns=sources)
    estimate.name = 'mdt_estimate'

    return return_data(mat=mat, results=(estimate, ))