-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathmakeCytoscapeNetwork.sh
More file actions
executable file
·257 lines (227 loc) · 8.04 KB
/
makeCytoscapeNetwork.sh
File metadata and controls
executable file
·257 lines (227 loc) · 8.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#!/usr/bin/env bash
#OR 110202
#IN: assembly fasta, transcript fatsa, AGP scaffolding file
#OUT: network files with interactions and properties, (and optionally files generated in process like blat output)
#DO: build network definition
ERROR_ARGUMENTS=65
ERROR_NOFILE=66
USAGE="\nUsage: \n\
`basename $0` -g genome.fasta -t transcript.fasta -a file.agp [-chklinoqsw]\n\
`basename $0` -p blat.alignment.psl -a file.agp [-chklinoqsw]\n\n\
-a file specifying scaffolding way in agp format\n\
-c flag to build network on contig level (required if contig.fasta is submitted)\n\
-g fasta file with scaffolded genome contigs (requires -c flag) or\n\
\tfasta file with genomic scaffolds (can include unscaffolded contigs)\n\
-h print help\n\
-k flag to keep blat alignment files\n\
\tdefault: do not keep\n\
-l match length (bp, minimum transcript match length for blat output filterting)\n\
\tdefault=200 \n\
-n network.name (string to be used for out files)\n\
\tdefault=out\n\
-o overlap (bp, maximum overlap of split transcript alignments)\n\
\tdefault=50 \n\
-p transcript to genome alignment file in psl format (can be submitted instead fasta files)\n\
-q flag to keep folder with temporary files\n\
-i intron size threshold (bp)\n\
\tdefault=20000 \n\
-s maximum allowed size difference between a gap and a sequence to fill in the gap (bp, required only for scaffold level) \n\
\tdefault=4000 \n\
-t fasta file with transcript sequences\n\
-w mismatch (floating, percentage of allowed mismatch bases)\n\
\tdefault=0.05\n"
# Print usage: scriptname -options, if script invoked with no command-line args
if [ $# -eq 0 ]
then
echo -e $USAGE
exit $ERROR_ARGUMENTS
fi
SCNversion=1.0.0
# Options followed by : expect an argument
while getopts "a:g:i:l:n:o:p:s:t:w:chkqv" Option
do
case $Option in
a ) inAGP=$OPTARG;;
c ) cflag=1;;
g ) inAssembly=$OPTARG;;
h ) echo -e $USAGE; exit 1;;
i ) myMaxIntron=$OPTARG;;
k ) kflag=1;;
l ) matchLength=$OPTARG;;
n ) outPrefix=$OPTARG;;
o ) maxOverlap=$OPTARG;;
p ) inUserPSL=$OPTARG;;
q ) qflag=1;;
s ) myMaxDifference=$OPTARG;;
t ) inTranscripts=$OPTARG;;
w ) misMatch=$OPTARG;;
v ) echo version: $SCNversion; exit;;
#if no argument is supplied to the option requiring it, it will fall into default
* ) echo "Unimplemented option chosen."; exit $ERROR_ARGUMENTS;; # DEFAULT
esac
done
# Decrements the argument pointer so it points to next argument.
shift $(($OPTIND - 1))
#store start time
startTime=`echo $(date) |awk '{split($4,myS,":"); print myS[1]*360+myS[2]*60+myS[3]}'`
#### CHECKING SCRIPTS in RELATIVE SCRIPT DIRECTORY
scriptDIR="$(cd ${0%/*}; echo "$PWD")"
scriptDIR=$scriptDIR/scripts
requiredScripts=(AGP_keepOnlyScaffolds.pl
blat_filterResults.pl
blat_filterDuplicatesWithOverlap.pl
blat_buildContigNetworkForCytoscape.pl
generateScaffoldingAttributes.pl)
for myscript in ${requiredScripts[@]}
do
if [[ ! -r $scriptDIR/$myscript ]]
then
echo Cannot open $scriptDIR/$myscript
exit $ERROR_NOFILE
fi
done
#### DEFAULT VALUES
maxOverlap_DEFAULT=50
outPrefix_DEFAULT="out"
matchLength_DEFAULT=200
misMatch_DEFAULT=0.05
myMaxIntron_DEFAULT=20000
myMaxDifference_DEFAULT=4000
#### SETTING UNDEFINED OPTIONS TO DEFAULTS
: ${maxOverlap=$maxOverlap_DEFAULT}
: ${outPrefix=$outPrefix_DEFAULT}
: ${matchLength=$matchLength_DEFAULT}
: ${misMatch=$misMatch_DEFAULT}
: ${myMaxIntron=$myMaxIntron_DEFAULT}
: ${myMaxDifference=$myMaxDifference_DEFAULT}
#### SETUP A TEMPORARY WORKING DIRECTORY
currentDir=$PWD
tempDIR=temp"$(date +%Y%m%d_%H%M%S)"
mkdir $tempDIR
if [[ -z $inUserPSL ]]
then
#use symlinks in temp dir
inFILES=($inAssembly $inTranscripts)
myAssembly=${inAssembly##*/}
myTranscripts=${inTranscripts##*/}
else
inFILES=($inUserPSL)
myUserPSL=${inUserPSL##*/}
fi
#make symlinks for input files (with absoulute paths)
for onefile in ${inFILES[@]}
do
ln -s "$(cd `dirname $onefile`; echo "$PWD")"/"${onefile##*/}" $tempDIR/
done
#if AGP is input, process it to retain only scaffolds (more than 1 element features)
if [[ ! -z $inAGP ]]
then
echo Retrieving scaffold features from AGP
perl $scriptDIR/AGP_keepOnlyScaffolds.pl $inAGP > $tempDIR/scaffold_only.agp
myAGP=scaffold_only.agp
fi
#go to RUN directory
cd $tempDIR
##### STEP1: RUN BLAT
if [[ -z $inUserPSL ]]
then
if [[ ! -x `which blat` ]]; then
echo Cannot find blat or is not an executable
exit $ERROR_NOFILE
else
myPSL=$outPrefix.psl
BLATcall="blat $myAssembly $myTranscripts $myPSL"
echo Aligning: $BLATcall
echo `$BLATcall`
fi
else
myPSL=$myUserPSL
fi
##### STEP2: FILTER BLAT OUTPUT (PSL FORMAT)
echo Filtering psl
mytag=l"$matchLength"f0m"$misMatch"g"$myMaxIntron"e0x30b1t0
awk 'FNR>5' "$myPSL" | perl $scriptDIR/blat_filterResults.pl -l $matchLength -e 0 -m $misMatch -g "$myMaxIntron" > "$outPrefix"_"$mytag".psl
sort -k10,10 -k12,12n "$outPrefix"_"$mytag".psl | perl $scriptDIR/blat_filterDuplicatesWithOverlap.pl -o $maxOverlap > "$outPrefix"_filtered.psl
##### STEP 3 generate alignment statistics
if [[ -z "$cflag" ]]
then
mytag=scaffold
else
mytag=contig
fi
myOutFile=$outPrefix.$mytag.blat.stat
printf "RunName\t"$outPrefix"\n" > $myOutFile
printf "unfilteredMatches\t" >> $myOutFile
awk 'FNR>5{print $10}' "$myPSL" | sort | uniq | wc -l >> $myOutFile
printf "Matches_max_mismatch_$misMatch\t" >> $myOutFile
mytag=l"$matchLength"f0m"$misMatch"g"$myMaxIntron"e0x30b1t0
cut -f10 "$outPrefix"_"$mytag".psl | sort | uniq | wc -l >> $myOutFile
myFraction=(0.50 0.90 0.95 0.99)
for oneFraction in ${myFraction[@]}
do
printf $oneFraction"LengthMatches\t" >> $myOutFile
mytag=l"$matchLength"f"$oneFraction"m"$misMatch"g"$myMaxIntron"e0x30b1t0
awk 'FNR>5' "$myPSL" | perl $scriptDIR/blat_filterResults.pl -l $matchLength -f $oneFraction -e 0 -m $misMatch -g "$myMaxIntron" > "$outPrefix"_"$mytag".psl
cut -f10 "$outPrefix"_"$mytag".psl | sort | uniq | wc -l >> $myOutFile
done
###### STEP4 generate cytoscape network
#Swith between contig and scaffold level
unset -v mytag
if [[ -z "$cflag" ]]
then
##### SCAFFOLD-LEVEL
# can process (reduced level) without agp
echo Processing on scaffold-level
bash $scriptDIR/processOnScaffoldLevel.sh $outPrefix $scriptDIR $myMaxIntron $myMaxDifference $myAGP
myOutFiles=("$outPrefix".scaffold.nw "$outPrefix".scaffold.attr "$outPrefix".withinOneScaffold $outPrefix.scaffold.blat.stat)
mytag=scaffold
else
#### CONTIG LEVEL
# agp file is required
echo Processing on contig-level
bash $scriptDIR/processOnContigLevel.sh $outPrefix $myAGP $scriptDIR
myOutFiles=("$outPrefix".contig.nw "$outPrefix".contig.attr $outPrefix.contig.blat.stat)
mytag=contig
fi
mv ${myOutFiles[@]} ..
#print options and in/out files used/generated
echo -e "\nRUN SUMMARY\n\
Input_files: ${inFILES[@]}\n\
Output_directory: $currentDir"
printf "Output_files: "
printf "%s " "${myOutFiles[@]}"
#remove alignment files, unless -k flag was set
if [[ ! -z "$kflag" ]]
then
outname="$outPrefix"_filtered."$mytag".psl
mv "$outPrefix"_filtered.psl ../"$outname"
printf "%s " "$outname"
if [[ -z "$inUserPSL" ]]
then
outname="$outPrefix"."$mytag".psl
mv "$outPrefix".psl ../"$outname"
printf "%s " "$outname"
fi
fi
cd ..
if [[ -z "$qflag" ]]
then
rm -r $tempDIR
else
outname="$outPrefix"_"$mytag"_"$tempDIR"
mv $tempDIR "$outname"
printf "\nTemporary_directory: $outname"
fi
#continue printing
echo -e "\n\
Options: maximum overlap between aligned transcript parts = $maxOverlap bp\n\
minimum match length = $matchLength bp\n\
maximum mismatch fraction of alignment = $misMatch\n\
intron threshold = $myMaxIntron bp"
if [[ -z "$cflag" ]]
then
echo -e " maximum difference between gap and scaffold to fit in = $myMaxDifference"
fi
endTime=`echo $(date) |awk '{split($4,myS,":"); print myS[1]*360+myS[2]*60+myS[3]}'`
printf $endTime"\t"$startTime | awk '{myTime=($1-$2)/60; print "Processing time: "myTime" minutes."}'
exit 0