1
function [Exp,RegNet,TFCoop,TFNames,GeneNames]=processData(exp_file,motif_file,ppi_file,modeProcess)
% Description:
%   processData reads the expression, motif and PPI inputs and aligns
%   them on common gene/TF name lists before running PANDA. Three
%   alignment modes are supported.
%
% Inputs:
%   exp_file    : path to a tab-delimited file with one row per gene: the
%                 gene name followed by its expression values
%                 (g genes by n conditions). The first line is parsed as
%                 data; its tab count sets the number of conditions.
%   motif_file  : path to the prior TF-gene regulatory network based on TF
%                 motifs, as a three-column edge list (TF, gene, weight).
%   ppi_file    : path to the TF-TF interaction graph, as a three-column
%                 edge list (TF1, TF2, weight).
%   modeProcess : 'legacy'       old deprecated behavior of netZooM <= 0.5
%                 'union'        (default) fills missing genes and TFs
%                                with zero rows
%                 'intersection' removes missing genes and TFs
%
% Outputs:
%   Exp       : aligned expression matrix, conditions by genes
%   RegNet    : aligned motif prior matrix, TFs by genes
%   TFCoop    : aligned PPI matrix, TFs by TFs
%   TFNames   : transcription factor names
%   GeneNames : gene names
%
% Author: Marouen Ben Guebila 12/2019

    % The documented default is 'union'; honour it when the argument is
    % omitted or passed empty.
    if nargin < 4 || isempty(modeProcess)
        modeProcess = 'union';
    end

    switch modeProcess
        case 'legacy'
            [Exp,RegNet,TFCoop,TFNames,GeneNames] = ...
                processLegacyData(exp_file,motif_file,ppi_file);

        case {'union','intersection'}
            [GeneMotif,GeneNamesExp,TfMotif,TFNamesInit,NumConditions,...
                ExpInit,TF,gene,weightMotif,weightPPI,TF1,TF2] = ...
                readData(exp_file,motif_file,ppi_file);

            % union/intersect already return sorted, duplicate-free lists.
            if strcmp(modeProcess,'union')
                GeneNames = union(GeneMotif,GeneNamesExp);
                TFNames   = union(TfMotif,TFNamesInit);
            else
                GeneNames = intersect(GeneMotif,GeneNamesExp);
                TFNames   = intersect(TfMotif,TFNamesInit);
            end

            [Exp,RegNet,TFCoop] = populateData(GeneNames,TFNames,NumConditions,...
                GeneNamesExp,ExpInit,TF,gene,weightMotif,weightPPI,TF1,TF2);

        otherwise
            % Fail loudly instead of returning with unassigned outputs.
            error('processData:unknownMode',...
                'Unknown modeProcess. Expected ''legacy'', ''union'' or ''intersection''.');
    end
end

function [Exp,RegNet,TFCoop,TFNames,GeneNames]=processLegacyData(exp_file,motif_file,ppi_file)
% Legacy (netZooM <= 0.5) reader: gene order comes from the expression
% file and TF order from the sorted motif TFs. No filling or filtering is
% done, so every gene and TF referenced by the motif/PPI files is assumed
% to be present in those lists (ismember yields 0 otherwise and the
% sub2ind call below rejects it).

    disp('Reading in expression data!');
    tic
    fid = fopen(exp_file, 'r');
    if fid == -1
        error('processData:fileNotFound',...
            'Cannot open expression file: %s', exp_file);
    end
    closeExpFile = onCleanup(@() fclose(fid)); % close even if parsing fails
    headings = fgetl(fid);
    n = length(regexp(headings, '\t'));        % tabs on first line = number of conditions
    frewind(fid);
    % Comments are not checked for (tiny speed-up).
    Exp = textscan(fid, ['%s', repmat('%f', 1, n)], 'delimiter', '\t');
    clear closeExpFile                         % closes the file now
    GeneNames = Exp{1};
    Exp = cat(2, Exp{2:end});
    [NumGenes, NumConditions] = size(Exp);
    fprintf('%d genes and %d conditions!\n', NumGenes, NumConditions);
    Exp = Exp'; % transpose expression matrix from gene-by-sample to sample-by-gene
    toc

    disp('Reading in motif data!');
    tic
    [TF, gene, weight] = textread(motif_file, '%s%s%f');
    TFNames = unique(TF);
    NumTFs  = length(TFNames);
    [~,tfIdx]   = ismember(TF, TFNames);
    [~,geneIdx] = ismember(gene, GeneNames);
    RegNet = zeros(NumTFs, NumGenes);
    RegNet(sub2ind([NumTFs, NumGenes], tfIdx, geneIdx)) = weight;
    fprintf('%d TFs and %d edges!\n', NumTFs, length(weight));
    toc

    disp('Reading in ppi data!');
    tic
    TFCoop = eye(NumTFs); % identity when no PPI file is supplied
    if ~isempty(ppi_file)
        [TF1, TF2, weight] = textread(ppi_file, '%s%s%f');
        [~,tf1Idx] = ismember(TF1, TFNames);
        [~,tf2Idx] = ismember(TF2, TFNames);
        % Write both orientations so the matrix is symmetric.
        TFCoop(sub2ind([NumTFs, NumTFs], tf1Idx, tf2Idx)) = weight;
        TFCoop(sub2ind([NumTFs, NumTFs], tf2Idx, tf1Idx)) = weight;
        fprintf('%d PPIs!\n', length(weight));
    end
    toc
end
80
+
81
function [Exp,RegNet,TFCoop]=populateData(GeneNames,TFNames,NumConditions,...
    GeneNamesExp,ExpInit,TF,gene,weightMotif,weightPPI,TF1,TF2)
% populateData builds the three PANDA input matrices on the target name
% lists GeneNames / TFNames.
%
% Inputs:
%   GeneNames     : target gene list (defines gene order of the outputs)
%   TFNames       : target TF list (defines TF order of the outputs)
%   NumConditions : number of expression columns in ExpInit
%   GeneNamesExp  : gene name of each row of ExpInit
%   ExpInit       : expression values, one row per entry of GeneNamesExp
%   TF, gene, weightMotif : motif edge list (TF name, gene name, weight)
%   TF1, TF2, weightPPI   : PPI edge list (TF name, TF name, weight)
%
% Outputs:
%   Exp    : NumConditions-by-NumGenes expression matrix; genes of
%            GeneNames absent from GeneNamesExp are zero columns
%   RegNet : NumTFs-by-NumGenes motif prior; edges naming a TF or gene
%            outside the target lists are dropped
%   TFCoop : NumTFs-by-NumTFs PPI matrix, written in both orientations;
%            edges naming a TF outside TFNames are dropped

    NumTFs   = length(TFNames);
    NumGenes = length(GeneNames);

    % Initialize result
    RegNet = zeros(NumTFs,NumGenes);
    Exp    = zeros(NumGenes,NumConditions);
    % NOTE(review): starts from zeros, so the diagonal is only set if the
    % PPI list contains self-interactions (the legacy branch of
    % processData starts from eye(NumTFs)) -- confirm this is intended.
    TFCoop = zeros(NumTFs,NumTFs);

    % Gene expression: copy each source row to the row of its gene in
    % GeneNames. The destination index must be the ismember location, not
    % the source position, otherwise rows are misaligned whenever the
    % expression file order differs from GeneNames.
    [isKept,rowInGeneNames] = ismember(GeneNamesExp,GeneNames);
    Exp(rowInGeneNames(isKept),:) = ExpInit(isKept,:);
    Exp = Exp'; % genes-by-conditions -> conditions-by-genes

    % Motif: keep only edges whose TF and gene are both in the target lists
    [~,tfIdx]   = ismember(TF, TFNames);
    [~,geneIdx] = ismember(gene, GeneNames);
    keepMotif   = tfIdx & geneIdx;
    tfIdx       = tfIdx(keepMotif);
    geneIdx     = geneIdx(keepMotif);
    weightMotif = weightMotif(keepMotif);
    RegNet(sub2ind([NumTFs, NumGenes], tfIdx, geneIdx)) = weightMotif;
    fprintf('%d TFs and %d edges!\n', NumTFs, length(weightMotif));

    % PPI: keep only edges whose two TFs are both in TFNames
    [~,tf1Idx] = ismember(TF1, TFNames);
    [~,tf2Idx] = ismember(TF2, TFNames);
    keepPPI    = tf1Idx & tf2Idx;
    tf1Idx     = tf1Idx(keepPPI);
    tf2Idx     = tf2Idx(keepPPI);
    weightPPI  = weightPPI(keepPPI);
    TFCoop(sub2ind([NumTFs, NumTFs], tf1Idx, tf2Idx)) = weightPPI;
    TFCoop(sub2ind([NumTFs, NumTFs], tf2Idx, tf1Idx)) = weightPPI;
end
112
+
113
function [GeneMotif,GeneNamesExp,TfMotif,TFNamesInit,NumConditions,...
    ExpInit,TF,gene,weightMotif,weightPPI,TF1,TF2]=...
    readData(exp_file,motif_file,ppi_file)
% readData loads the raw expression, motif and PPI files without aligning
% them.
%
% Inputs:
%   exp_file   : tab-delimited file, one row per gene (name, then values)
%   motif_file : three-column edge list (TF, gene, weight)
%   ppi_file   : three-column edge list (TF1, TF2, weight); if empty, an
%                identity PPI over the motif TFs is used
%
% Outputs:
%   GeneMotif     : unique gene names found in the motif file
%   GeneNamesExp  : gene names of the expression file, in file order
%   TfMotif       : unique TF names found in the motif file
%   TFNamesInit   : unique TF names found in the PPI data
%   NumConditions : number of expression value columns
%   ExpInit       : expression values, one row per entry of GeneNamesExp
%   TF, gene, weightMotif : motif edge list columns
%   weightPPI, TF1, TF2   : PPI edge list columns
%
% Errors if the expression file cannot be opened, if it contains duplicate
% gene names, or if the sets of names in the two PPI columns differ.

    % Read expression
    disp('Reading in expression data!');
    tic
    fid = fopen(exp_file, 'r');
    if fid == -1
        error('processData:fileNotFound',...
            'Cannot open expression file: %s', exp_file);
    end
    closeExpFile = onCleanup(@() fclose(fid)); % close even if parsing fails
    headings = fgetl(fid);
    n = length(regexp(headings, '\t'));        % tabs on first line = number of conditions
    frewind(fid);
    % Comments are not checked for (tiny speed-up).
    ExpInit = textscan(fid, ['%s', repmat('%f', 1, n)], 'delimiter', '\t');
    clear closeExpFile                         % closes the file now
    GeneNamesExp = ExpInit{1};
    ExpInit = cat(2, ExpInit{2:end});
    [~, NumConditions] = size(ExpInit);
    toc
    if length(unique(GeneNamesExp)) ~= length(GeneNamesExp)
        error('There are duplicate genes in the expression matrix.')
    end

    % Read motif
    disp('Reading in motif data!');
    [TF, gene, weightMotif] = textread(motif_file, '%s%s%f');
    TfMotif   = unique(TF);
    GeneMotif = unique(gene);

    % Read PPI
    disp('Reading in ppi data!');
    if ~isempty(ppi_file)
        [TF1, TF2, weightPPI] = textread(ppi_file, '%s%s%f');
    else
        % No PPI supplied: use self-interactions of weight 1 for every
        % motif TF (an identity matrix once populated), mirroring the
        % eye(NumTFs) default of the legacy mode. Without this the PPI
        % outputs would be left unassigned.
        TF1 = TfMotif;
        TF2 = TfMotif;
        weightPPI = ones(length(TfMotif), 1);
    end
    TFNamesInit = unique(TF1);
    % Both PPI columns must name the same set of TFs.
    if ~isequal(TFNamesInit, unique(TF2))
        error('PPI data has missing information.')
    end
end
0 commit comments