Skip to content

Commit 0f209ea

Browse files
authored
Add online coexpression (#360)
* adding coexpression to lioness and cleaning the loops to remove repeated code * fixed bug in online coexpression. Helped by Marouen * adding tests for online coexpression * fixing request for numpy lower than 2.0 * checking if macos-latest works * removing macos-latest --------- Co-authored-by: violafanfani <[email protected]>
1 parent 5e4f680 commit 0f209ea

File tree

4 files changed

+121
-70
lines changed

4 files changed

+121
-70
lines changed

netZooPy/lioness/lioness.py

+92-62
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import pandas as pd
66
from .timer import Timer
7+
from .onlineCoexpression import onlineCoexpression
78
from joblib.externals.loky import set_loky_pickler
89
from joblib import parallel_backend
910
from joblib import Parallel, delayed
@@ -67,6 +68,8 @@ class Lioness(Panda):
6768
samples as column name
6869
ignore_final: bool
6970
if True, no lioness network is kept in memory. This requires saving single networks at each step
71+
online_coexpression: bool
72+
if True, each LIONESS correlation is computed using the online coexpression method.
7073
Returns
7174
--------
7275
export_lioness_results : _
@@ -127,6 +130,7 @@ def __init__(
127130
save_single = False,
128131
export_filename = None,
129132
ignore_final=False,
133+
online_coexpression=False,
130134
):
131135
""" Initialize instance of Lioness class and load data.
132136
"""
@@ -168,6 +172,7 @@ def __init__(
168172
self.gene_names = gene_names
169173
self.tf_names = tf_names
170174
del obj
175+
self.online_coexpression = online_coexpression
171176

172177
# Get sample range to iterate
173178
# the number of conditions is the N parameter used for the number of samples in the whole background
@@ -217,9 +222,33 @@ def __init__(
217222
# We create the folder
218223
if not os.path.exists(save_dir):
219224
os.makedirs(save_dir)
225+
226+
# if we are using online_coexpression
227+
if self.online_coexpression:
228+
self.mi_all = np.mean(self.expression_matrix, axis=1)
229+
# Here we need to specify the ddof=1 to get the estimator that matches corrcoef
230+
self.std_all = np.std(self.expression_matrix, axis=1, ddof = 1)
231+
self.cov_all = np.cov(self.expression_matrix)
232+
# We need to set the diagonal to 1 if na and 0 elsewhere
233+
# Get indices of the matrix
234+
rows, cols = np.indices(self.cov_all.shape)
235+
# Create mask for NaNs
236+
nan_mask = np.isnan(self.cov_all)
237+
# Replace NaNs based on diagonal condition
238+
self.cov_all[nan_mask & (rows == cols)] = 1 # Diagonal NaNs → 1
239+
self.cov_all[nan_mask & (rows != cols)] = 0 # Off-diagonal NaNs → 0
240+
241+
# we need n_conditions as number of samples
242+
assert(self.n_conditions>0)
243+
244+
245+
246+
220247
#############
221248
# Run LIONESS
222249
#############
250+
251+
223252
if int(self.n_conditions) >= int(self.n_cores) and self.computing == "cpu":
224253
# the total_lioness_network here is a list of 1d
225254
# arrays (network:(tfXgene,1),gene_targeting:(gene,1),tf_targeting:(tf,1))
@@ -231,6 +260,11 @@ def __init__(
231260
for i in self.indexes:
232261
self.total_lioness_network = self.__lioness_loop(i)
233262
self.total_lioness_network = self.total_lioness_network.T
263+
264+
#############
265+
# OUTPUT ####
266+
#############
267+
234268
# create result data frame
235269
if self.ignore_final:
236270
print('WARNING: we do not keep all lionesses in memory. They have been saved singularly.')
@@ -279,34 +313,47 @@ def __init__(
279313
else:
280314
self.save_lioness_results()
281315

282-
def __lioness_loop(self, i):
283-
#TODO: this is now for GPU only in practice
284-
""" Initialize instance of Lioness class and load data.
316+
def __compute_subset_panda(self,i):
317+
""" Compute the subset panda network using the correlation matrix and the motif matrix.
285318
319+
Parameters
320+
----------
321+
correlation_matrix: array
322+
The coexpression network to be used for computing the subset panda network.
323+
286324
Returns
287-
--------
288-
self.total_lioness_network: array
289-
An edge-by-sample matrix containing sample-specific networks.
325+
-------
326+
subset_panda_network: array
327+
The subset panda network.
290328
"""
291-
# for i in self.indexes:
292-
print("Running LIONESS for sample %d/%d:" %((i),(self.n_conditions)))
293329
idx = [x for x in range(self.n_conditions) if x != i] # all samples except i
294-
with Timer("Computing coexpression network:"):
295-
if self.computing == "gpu":
296-
import cupy as cp
297-
298-
correlation_matrix_cp = cp.corrcoef(self.expression_matrix[:, idx].astype(self.np_dtype)).astype(self.np_dtype)
299-
if cp.isnan(correlation_matrix_cp).any():
300-
cp.fill_diagonal(correlation_matrix_cp, 1)
301-
correlation_matrix_cp = cp.nan_to_num(correlation_matrix_cp)
302-
correlation_matrix = cp.asnumpy(correlation_matrix_cp)
303-
del correlation_matrix_cp
304-
cp._default_memory_pool.free_all_blocks()
305-
else:
306-
correlation_matrix = np.corrcoef(self.expression_matrix[:, idx])
307-
if np.isnan(correlation_matrix).any():
308-
np.fill_diagonal(correlation_matrix, 1)
309-
correlation_matrix = np.nan_to_num(correlation_matrix)
330+
331+
if self.online_coexpression:
332+
print("Computing online coexpression")
333+
correlation_matrix = onlineCoexpression(
334+
self.expression_matrix[:, i],
335+
self.n_conditions,
336+
self.mi_all,
337+
self.std_all,
338+
self.cov_all,
339+
)
340+
else:
341+
with Timer("Computing coexpression network:"):
342+
if self.computing == "gpu":
343+
import cupy as cp
344+
345+
correlation_matrix_cp = cp.corrcoef(self.expression_matrix[:, idx].astype(self.np_dtype)).astype(self.np_dtype)
346+
if cp.isnan(correlation_matrix_cp).any():
347+
cp.fill_diagonal(correlation_matrix_cp, 1)
348+
correlation_matrix_cp = cp.nan_to_num(correlation_matrix_cp)
349+
correlation_matrix = cp.asnumpy(correlation_matrix_cp)
350+
del correlation_matrix_cp
351+
cp._default_memory_pool.free_all_blocks()
352+
else:
353+
correlation_matrix = np.corrcoef(self.expression_matrix[:, idx])
354+
if np.isnan(correlation_matrix).any():
355+
np.fill_diagonal(correlation_matrix, 1)
356+
correlation_matrix = np.nan_to_num(correlation_matrix)
310357

311358
with Timer("Normalizing networks:"):
312359
correlation_matrix_orig = (
@@ -328,6 +375,23 @@ def __lioness_loop(self, i):
328375
del correlation_matrix
329376
subset_panda_network = correlation_matrix_orig
330377

378+
return subset_panda_network
379+
380+
def __lioness_loop(self, i):
381+
#TODO: this is now for GPU only in practice
382+
""" Initialize instance of Lioness class and load data.
383+
384+
Returns
385+
--------
386+
self.total_lioness_network: array
387+
An edge-by-sample matrix containing sample-specific networks.
388+
"""
389+
# for i in self.indexes:
390+
print("Running LIONESS for sample %d/%d:" %((i),(self.n_conditions)))
391+
392+
# get subset panda network
393+
subset_panda_network = self.__compute_subset_panda(i)
394+
331395
# For consistency with R, we are using the N panda_all - (N-1) panda_all_but_q
332396
lioness_network = (self.n_conditions * self.network) - (
333397
(self.n_conditions - 1) * subset_panda_network
@@ -367,7 +431,7 @@ def __lioness_loop(self, i):
367431

368432
@delayed
369433
@wrap_non_picklable_objects
370-
def __par_lioness_loop(self, i, output):
434+
def __par_lioness_loop(self, i, output, online_coexpression=None):
371435
""" Initialize instance of Lioness class and load data.
372436
373437
Returns
@@ -376,44 +440,10 @@ def __par_lioness_loop(self, i, output):
376440
An edge-by-sample matrix containing sample-specific networks.
377441
"""
378442
# for i in self.indexes:
379-
print("Running LIONESS for sample %d:" % (i + 1))
380-
idx = [x for x in range(self.n_conditions) if x != i] # all samples except i
381-
with Timer("Computing coexpression network:"):
382-
if self.computing == "gpu":
383-
import cupy as cp
384-
385-
correlation_matrix = cp.corrcoef(self.expression_matrix[:, idx])
386-
if cp.isnan(correlation_matrix).any():
387-
cp.fill_diagonal(correlation_matrix, 1)
388-
correlation_matrix = cp.nan_to_num(correlation_matrix)
389-
correlation_matrix = cp.asnumpy(correlation_matrix)
390-
else:
391-
# run on CPU with numpy
392-
correlation_matrix = np.corrcoef(self.expression_matrix[:, idx])
393-
if np.isnan(correlation_matrix).any():
394-
np.fill_diagonal(correlation_matrix, 1)
395-
correlation_matrix = np.nan_to_num(correlation_matrix)
396-
397-
with Timer("Normalizing networks:"):
398-
correlation_matrix_orig = (
399-
correlation_matrix # save matrix before normalization
400-
)
401-
correlation_matrix = self._normalize_network(correlation_matrix)
443+
print("Running LIONESS for sample %d/%d:" %((i),(self.n_conditions)))
402444

403-
with Timer("Inferring LIONESS network:"):
404-
# TODO: fix this correlation matrix+delete
405-
if self.motif_matrix is not None:
406-
del correlation_matrix_orig
407-
subset_panda_network = compute_panda(
408-
correlation_matrix,
409-
np.copy(self.ppi_matrix),
410-
np.copy(self.motif_matrix),
411-
computing = self.computing,
412-
alpha = self.alpha,
413-
)
414-
else:
415-
del correlation_matrix
416-
subset_panda_network = correlation_matrix_orig
445+
# get subset panda network
446+
subset_panda_network = self.__compute_subset_panda(i)
417447

418448
# For consistency with R, we are using the N panda_all - (N-1) panda_all_but_q
419449
#lioness_network = self.n_conditions * (self.network - subset_panda_network) + subset_panda_network

netZooPy/lioness/onlineCoexpression.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ def onlineCoexpression(si,n,mi,std,cov):
2929
# Then we compute the new std online using the orthogonality trick
3030
newstd = np.sqrt((np.square(std) - (1/n) * np.square(si - newm)) * ((n-1)/(n-2)))
3131
# Then we compute the new covariance online
32-
onCov= (1/(n-2)) * ( (cov*(n-1)) - ( (n/(n-1)) * (np.matmul((si-mi).T,(si-mi))) ) )
32+
onCov= (1/(n-2)) * ( (cov*(n-1)) - ( (n/(n-1)) * (np.outer((si-mi).T,(si-mi))) ) )
3333
# Finally, we derive the new coexpression online
34-
onCoex= onCov / np.matmul(newstd.T,newstd)
35-
# We set the diagonal explicitly to avoid numerical stability
34+
onCoex= onCov / np.outer(newstd.T,newstd)
35+
# We set the diagonal explicitly to avoid numerical instability
3636
np.fill_diagonal(onCoex, 1)
3737

3838
return onCoex

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
install_requires=[
1515
'h5py',
1616
'pandas',
17-
'numpy>=1.19.2',
17+
'numpy>=1.19.2,<2.0',
1818
'networkx>=2.6.3',
1919
'matplotlib>=3.3.4',
2020
'scipy>=1.5.3',

tests/test_lioness.py

+25-4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,22 @@ def test_lioness():
2727
output_results_mat = "lioness_output/toy-lioness-res.mat"
2828
toy_r_file = 'tests/lioness/toy-lioness-first4-1.txt'
2929

30+
31+
# Check that online coexpression is the same.
32+
# Here we are also using some of the newer processing options
33+
print('\n\nTest online coexpression\n\n')
34+
panda_obj = Panda(expression_data, motif, ppi,computing='cpu',precision='single',save_tmp=False,save_memory = False, remove_missing=True, keep_expression_matrix = True, modeProcess = 'intersection')
35+
36+
lioness_obj = Lioness(panda_obj,computing='cpu', save_dir='lioness_dir', save_single = True, export_filename=None, save_fmt='h5', output = 'network', subset_numbers=[1,2,10])
37+
38+
# Test with online coexpression
39+
lioness_obj_online = Lioness(panda_obj,computing='cpu', save_dir='lioness_dir', save_single = True, export_filename=None, save_fmt='h5', output = 'network', subset_numbers=[1,2,10], online_coexpression=True)
40+
41+
print('\n\nActual test coexpression\n\n')
42+
print(np.array_equal(lioness_obj.total_lioness_network, lioness_obj_online.total_lioness_network))
43+
assert np.array_equal(lioness_obj.total_lioness_network, lioness_obj_online.total_lioness_network)
44+
45+
3046
panda_obj = Panda(
3147
expression_data,
3248
motif,
@@ -62,8 +78,11 @@ def test_lioness():
6278
np.allclose(panda_obj.motif_matrix, lioness_obj.motif_matrix)
6379
np.allclose(panda_obj.ppi_matrix, lioness_obj.ppi_matrix)
6480

81+
82+
83+
6584
# Compare with R
66-
print("Compare with R")
85+
print("\n\nCompare with R")
6786
rdf = pd.read_csv(toy_r_file, sep = ' ', header=None, )
6887
pydf = pd.read_csv(output_table, sep = ' ').iloc[:,0:3]
6988

@@ -107,12 +126,13 @@ def test_lioness():
107126
gt = np.load("lioness_output_cmd/lioness.1.0.npy")
108127
assert np.allclose(gt, res)
109128

129+
print("\n\nTest lioness with no motif")
110130
# 2. Testing Lioness with motif set to None to compute Lioness on coexpression networks
111-
motif = None
131+
motif_none = None
112132
# Make sure to keep epxression matrix for next step
113133
panda_obj_2 = Panda(
114134
expression_data,
115-
motif,
135+
motif_none,
116136
ppi,
117137
save_tmp=True,
118138
remove_missing=rm_missing,
@@ -128,6 +148,7 @@ def test_lioness():
128148
assert np.allclose(gt, res)
129149

130150
print('test3')
151+
print("\n\nTest lioness with no motif")
131152
# 3. Testing Lioness in parallel
132153
# Set parameters
133154
os.remove("lioness_output/lioness.1.0.npy")
@@ -144,4 +165,4 @@ def test_lioness():
144165

145166
# 5. #TODO: add tf targeting score
146167

147-
#6. #TODO: add gene targeting score
168+
#6. #TODO: add gene targeting score

0 commit comments

Comments
 (0)