forked from saezlab/decoupler-py
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmethod_udt.py
115 lines (90 loc) · 3.61 KB
/
method_udt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
Method UDT.
Code to run the Univariate Decision Tree (UDT) method.
"""
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
from tqdm.auto import tqdm
def check_if_sklearn():
    """
    Import scikit-learn on demand and hand the package back to the caller.

    Returns
    -------
    sk : module
        The top-level `sklearn` package, with `sklearn.tree` already imported.

    Raises
    ------
    ImportError
        If importing `sklearn` or `sklearn.tree` fails for any reason.
    """
    try:
        # Importing the submodule also imports the parent package
        import sklearn.tree
        import sklearn as sk
    except Exception:
        raise ImportError('sklearn is not installed. Please install it with: pip install scikit-learn')
    else:
        return sk
def fit_dt(sk, regulator, sample, min_leaf=5, seed=42):
    """
    Fit a one-feature regression tree and return that feature's importance.

    Parameters
    ----------
    sk : module
        Object exposing `tree.DecisionTreeRegressor` (the `sklearn` package).
    regulator : array
        Values used as the single explanatory variable; reshaped to one column.
    sample : array
        Values used as the response; reshaped to one column.
    min_leaf : int
        Forwarded to the regressor as `min_samples_leaf`.
    seed : int
        Forwarded to the regressor as `random_state`.

    Returns
    -------
    importance : scalar
        First entry of the fitted tree's unnormalized feature importances.
    """
    # Both inputs are turned into single-column 2D arrays
    features = regulator.reshape(-1, 1)
    response = sample.reshape(-1, 1)

    model = sk.tree.DecisionTreeRegressor(min_samples_leaf=min_leaf, random_state=seed)
    # The fit is requested with check_input=False
    model.fit(features, response, check_input=False)

    # Only one feature exists, so its importance sits at position 0
    importances = model.tree_.compute_feature_importances(normalize=False)
    return importances[0]
def udt(mat, net, min_leaf=5, seed=42, verbose=False):
    """
    Compute a UDT score for every (sample, regulator) pair.

    Parameters
    ----------
    mat : csr_matrix or array
        Rows are visited one by one as samples; a `csr_matrix` row is converted to a dense array before fitting.
    net : array
        Two-dimensional array whose columns are passed, one at a time, as the regulator.
    min_leaf : int
        Forwarded to `fit_dt`.
    seed : int
        Forwarded to `fit_dt`.
    verbose : bool
        Whether to show a progress bar over samples.

    Returns
    -------
    acts : ndarray
        Array of shape (mat.shape[0], net.shape[1]) with the value returned by `fit_dt` for each pair.
    """
    # sklearn has to be importable before anything else happens
    sk = check_if_sklearn()

    # One row per sample, one column per regulator
    n_samples, n_sources = mat.shape[0], net.shape[1]
    acts = np.zeros((n_samples, n_sources))

    # The container type is fixed for the whole run, so test it a single time
    is_sparse = isinstance(mat, csr_matrix)

    for row_idx in tqdm(range(n_samples), disable=not verbose):
        # Sparse rows are densified; dense rows are used directly
        row = mat[row_idx].toarray()[0] if is_sparse else mat[row_idx]
        for col_idx in range(n_sources):
            acts[row_idx, col_idx] = fit_dt(sk, net[:, col_idx], row, min_leaf=min_leaf, seed=seed)

    return acts
def run_udt(mat, net, source='source', target='target', weight='weight', min_leaf=5, min_n=5, seed=42, verbose=False,
            use_raw=True):
    """
    Univariate Decision Tree (UDT).

    UDT fits a single regression decision tree for each sample and regulator, where the observed molecular readouts in
    `mat` are the response variable and the regulator weights in `net` are the explanatory one. Target features with no
    associated weight are set to zero. The obtained feature importance from the fitted model is the activity
    (`udt_estimate`) of a given regulator.

    Parameters
    ----------
    mat : list, DataFrame or AnnData
        List of [features, matrix], dataframe (samples x features) or an AnnData instance.
    net : DataFrame
        Network in long format.
    source : str
        Column name in net with source nodes.
    target : str
        Column name in net with target nodes.
    weight : str
        Column name in net with weights.
    min_leaf : int
        The minimum number of samples required to be at a leaf node.
    min_n : int
        Minimum of targets per source. If less, sources are removed.
    seed : int
        Random seed to use.
    verbose : bool
        Whether to show progress.
    use_raw : bool
        Use raw attribute of mat if present.

    Returns
    -------
    estimate : DataFrame
        UDT scores. Stored in `.obsm['udt_estimate']` if `mat` is AnnData.

    Notes
    -----
    This method produces the estimate only; no p-values are computed or returned.
    """
    # Extract the data matrix together with its row (sample) and column (feature) labels
    data, sample_names, feature_names = extract(mat, use_raw=use_raw, verbose=verbose)

    # Transform net: standardize column names, then filter sources by `min_n`
    net = rename_net(net, source=source, target=target, weight=weight)
    net = filt_min_n(feature_names, net, min_n=min_n)
    sources, targets, net_mat = get_net_mat(net)

    # Match arrays so the network is aligned with the features in the data
    net_mat = match(feature_names, targets, net_mat)

    if verbose:
        print('Running udt on mat with {0} samples and {1} targets for {2} sources.'.format(
            data.shape[0], len(feature_names), net_mat.shape[1]))

    # Run UDT
    scores = udt(data, net_mat, min_leaf=min_leaf, seed=seed, verbose=verbose)

    # Transform to df; the name is set on the frame before it is handed to return_data
    estimate = pd.DataFrame(scores, index=sample_names, columns=sources)
    estimate.name = 'udt_estimate'

    return return_data(mat=mat, results=(estimate, ))