-
Notifications
You must be signed in to change notification settings - Fork 61
Expand file tree
/
Copy pathgetBlast.m
More file actions
executable file
·186 lines (172 loc) · 7.68 KB
/
Copy pathgetBlast.m
File metadata and controls
executable file
·186 lines (172 loc) · 7.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
function [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
modelIDs,refFastaFiles,developMode,hideVerbose)
% getBlast
% Performs a bidirectional BLAST between the organism of interest and a
% set of template organisms
%
% Input:
% organismID the id of the organism of interest. This should also
% match with the id supplied to getModelFromHomology
% fastaFile a FASTA file with the protein sequences for the
% organism of interest
% modelIDs a cell array of model ids. These must match the
% "model.id" fields in the "models" structure if the
% output is to be used with getModelFromHomology
% refFastaFiles a cell array with the paths to the corresponding FASTA
% files
% developMode true if blastReport should be generated that is used
% in the unit testing function for BLAST+ (optional, default
% false)
% hideVerbose true if no status messages should be printed (optional,
% default false)
%
% Output:
% blastStructure structure containing the bidirectional homology
% measurements that can be used by getModelFromHomology
% blastReport structure containing MD5 hashes for FASTA database
% files and non-parsed BLAST output data. Will be blank
% if developMode is false.
%
% NOTE: This function calls BLAST+ to perform a bidirectional homology
% test between the organism of interest and a set of other organisms
% using standard settings. The only filtering this function does is the
% removal of hits with an E-value higher than 10e-5. The other homology
% measurements can be implemented using getBlastFromExcel.
%
% Usage: [blastStructure,blastReport]=getBlast(organismID,fastaFile,...
% modelIDs,refFastaFiles,developMode,hideVerbose)
if nargin<5
developMode = false;
end
if nargin<6
hideVerbose = false;
end
%Everything should be cell arrays
organismID=convertCharArray(organismID);
fastaFile=convertCharArray(fastaFile);
modelIDs=convertCharArray(modelIDs);
refFastaFiles=convertCharArray(refFastaFiles);
%Create blank structures for results
blastStructure=[];
blastReport.dbHashes.phr={};
blastReport.dbHashes.pot={};
blastReport.dbHashes.psq={};
blastReport.dbHashes.pto={};
blastReport.blastTxtOutput={};
%Get the directory for RAVEN Toolbox
ravenPath=findRAVENroot();
%Generate temporary names for BLAST databases and output files
tmpDB=tempname;
outFile=tempname;
if ispc && contains(tmpDB,' ')
warning(['MATLAB assigned ''%s'' as temporary file path, but it '...
'contains a space character, which is not compatible with '...
'BLAST. Instead, a temporary folder will be initiated at '...
'''C:\\tempRAVEN\\'' which should manually be removed when '...
'finished.'],tmpDB)
tmpDB=tempname('C:\tempRAVEN');
outFile=tempname('C:\tempRAVEN');
end
%Check for existence of files. If no full path is specified for a file,
%assume that it is in the current folder
if isrow(refFastaFiles)
files=horzcat(fastaFile,refFastaFiles);
else
files=vertcat(fastaFile,refFastaFiles);
end
files=checkFileExistence(files,2); %Copy files to temp dir
fastaFile = files(1);
refFastaFiles = files(2:end);
%Identify the operating system
if isunix
if ismac
binEnd='.mac';
else
binEnd='';
end
elseif ispc
binEnd='.exe';
setenv('BLASTDB_LMDB_MAP_SIZE','1000000');
else
dispEM('Unknown OS, exiting.')
return
end
%Run BLAST multi-threaded to use all logical cores assigned to MATLAB
cores = evalc('feature(''numcores'')');
cores = strsplit(cores, 'MATLAB was assigned: ');
cores = regexp(cores{2},'^\d*','match');
cores = cores{1};
%Create a database for the new organism and blast each of the refFastaFiles
%against it
[status, message]=system(['"' fullfile(ravenPath,'software','blast+',['makeblastdb' binEnd]) '" -in "' fastaFile{1} '" -out "' fullfile(tmpDB, 'tmpDB') '" -dbtype prot']);
if developMode
blastReport.dbHashes.phr{numel(blastReport.dbHashes.phr)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.phr'));
blastReport.dbHashes.pot{numel(blastReport.dbHashes.pot)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pot'));
blastReport.dbHashes.psq{numel(blastReport.dbHashes.psq)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.psq'));
blastReport.dbHashes.pto{numel(blastReport.dbHashes.pto)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pto'));
end
if status~=0
error('makeblastdb did not run successfully, error:\n%s',strip(message))
end
for i=1:numel(refFastaFiles)
if ~hideVerbose
fprintf(['BLASTing "' modelIDs{i} '" against "' organismID{1} '"..\n']);
end
[status, message]=system(['"' fullfile(ravenPath,'software','blast+',['blastp' binEnd]) '" -query "' refFastaFiles{i} '" -out "' outFile '_' num2str(i) '" -db "' fullfile(tmpDB, 'tmpDB') '" -evalue 10e-5 -outfmt "10 qseqid sseqid evalue pident length bitscore ppos" -num_threads "' cores '"']);
if developMode
blastReport.blastTxtOutput{numel(blastReport.blastTxtOutput)+1}=importdata([outFile '_' num2str(i)]);
end
if status~=0
error('blastp did not run successfully, error:\n%s',strip(message))
end
end
delete([tmpDB filesep 'tmpDB*']);
%Then create a database for each of the reference organisms and blast the
%new organism against them
for i=1:numel(refFastaFiles)
if ~hideVerbose
fprintf(['BLASTing "' organismID{1} '" against "' modelIDs{i} '"..\n']);
end
[status, message]=system(['"' fullfile(ravenPath,'software','blast+',['makeblastdb' binEnd]) '" -in "' refFastaFiles{i} '" -out "' fullfile(tmpDB, 'tmpDB') '" -dbtype prot']);
if status~=0
error('makeblastdb did not run successfully, error:\n%s',strip(message))
end
[status, message]=system(['"' fullfile(ravenPath,'software','blast+',['blastp' binEnd]) '" -query "' fastaFile{1} '" -out "' outFile '_r' num2str(i) '" -db "' fullfile(tmpDB, 'tmpDB') '" -evalue 10e-5 -outfmt "10 qseqid sseqid evalue pident length bitscore ppos" -num_threads "' cores '"']);
if developMode
blastReport.dbHashes.phr{numel(blastReport.dbHashes.phr)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.phr'));
blastReport.dbHashes.pot{numel(blastReport.dbHashes.pot)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pot'));
blastReport.dbHashes.psq{numel(blastReport.dbHashes.psq)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.psq'));
blastReport.dbHashes.pto{numel(blastReport.dbHashes.pto)+1}=getMD5Hash(fullfile(tmpDB, 'tmpDB.pto'));
blastReport.blastTxtOutput{numel(blastReport.blastTxtOutput)+1}=importdata([outFile '_r' num2str(i)]);
end
if status~=0
error('blastp did not run successfully, error:\n%s',strip(message))
end
delete([tmpDB filesep 'tmpDB*']);
end
%Done with the BLAST, do the parsing of the text files
for i=1:numel(refFastaFiles)*2
tempStruct=[];
if i<=numel(refFastaFiles)
tempStruct.fromId=modelIDs{i};
tempStruct.toId=organismID{1};
A=readtable([outFile '_' num2str(i)],'Delimiter',',','Format','%s%s%f%f%f%f%f');
else
tempStruct.fromId=organismID{1};
tempStruct.toId=modelIDs{i-numel(refFastaFiles)};
A=readtable([outFile '_r' num2str(i-numel(refFastaFiles))],'Delimiter',',','Format','%s%s%f%f%f%f%f');
end
tempStruct.fromGenes=A{:,1};
tempStruct.toGenes=A{:,2};
tempStruct.evalue=table2array(A(:,3));
tempStruct.identity=table2array(A(:,4));
tempStruct.aligLen=table2array(A(:,5));
tempStruct.bitscore=table2array(A(:,6));
tempStruct.ppos=table2array(A(:,7));
blastStructure=[blastStructure tempStruct];
end
%Remove the old tempfiles
delete([outFile '*']);
%Remove the temp fasta files
delete(files{:})
end