-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpull_out_sequences_described_in_bed_file.pl
241 lines (214 loc) · 6.34 KB
/
pull_out_sequences_described_in_bed_file.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env perl
# Pulls out subsets of sequences by name and position within that sequence from bed file.
# Usage:
# perl pull_out_sequences_described_in_bed_file.pl [fasta file path] [bed file path]
# [0 to output fasta sequence, 1 to add sequence as new column in bed file]
# Prints to console. To print to file, use
# perl pull_out_sequences_described_in_bed_file.pl [fasta file path] [bed file path]
# [0 to output fasta sequence, 1 to add sequence as new column in bed file]
# > [output file path]
use strict;
use warnings;
my $fasta_file = $ARGV[0]; # fasta file
my $bed_file = $ARGV[1]; # tab-separated table with columns: sequence name, first position (0-indexed), non-inclusive end position (0-indexed), optional sub-sequence name, optional score, optional strand (+ or -); sequence names must match those in fasta file
my $output_as_column_in_bed_file = $ARGV[2]; # 0 to output fasta sequence, 1 to add sequence as new column in bed file
# in bed file:
my $SEQUENCE_NAME_COLUMN = 0;
my $START_POSITION_COLUMN = 1;
my $END_POSITION_COLUMN = 2;
my $BED_FILE_SUBSEQUENCE_NAME_COLUMN = 3; # optional
my $STRAND_COLUMN = 5; # optional
my $DELIMITER = "\t";
my $NEWLINE = "\n";
# verifies that fasta file exists and is non-empty
if(!$fasta_file)
{
print STDERR "Error: no input fasta file provided. Exiting.\n";
die;
}
if(!-e $fasta_file)
{
print STDERR "Error: input fasta file does not exist:\n\t".$fasta_file."\nExiting.\n";
die;
}
if(-z $fasta_file)
{
print STDERR "Error: input fasta file is empty:\n\t".$fasta_file."\nExiting.\n";
die;
}
# verifies that input bed file exists and is non-empty
if(!$bed_file)
{
print STDERR "Error: no input bed file provided. Exiting.\n";
die;
}
if(!-e $bed_file)
{
print STDERR "Error: input bed file does not exist:\n\t".$bed_file."\nExiting.\n";
die;
}
if(-z $bed_file)
{
print STDERR "Error: input bed file is empty:\n\t".$bed_file."\nExiting.\n";
die;
}
# reads in sequence names in bed file
my %sequence_name_in_bed_file = (); # key: sequence name -> value: 1 if sequence name is in bed file
open BED_FILE, "<$bed_file" || die "Could not open $bed_file to read; terminating =(\n";
while(<BED_FILE>) # for each line in the file
{
chomp;
if($_ =~ /\S/) # non-empty line
{
my @values = split($DELIMITER, $_);
my $sequence_name = $values[$SEQUENCE_NAME_COLUMN];
$sequence_name_in_bed_file{$sequence_name} = 1;
}
}
close BED_FILE;
# reads in fasta sequences named in bed file
my %sequence_name_to_sequence = (); # key: sequence name -> value: sequence
open FASTA_FILE, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
my $current_sequence_name = "";
my $current_sequence = "";
while(<FASTA_FILE>) # for each line in the file
{
chomp;
my $line = $_;
if($line =~ /^>(.*)$/) # header line
{
# saves previous sequence if it appeared in bed file
if($current_sequence_name and $current_sequence and $sequence_name_in_bed_file{$current_sequence_name})
{
$sequence_name_to_sequence{$current_sequence_name} = $current_sequence;
}
# updates current sequence name and clears current sequence
$current_sequence_name = $1;
$current_sequence = "";
}
else # sequence, not a header line
{
# adds to current sequence
$current_sequence .= $line;
}
}
close FASTA_FILE;
# saves last sequence if it appeared in bed file
if($current_sequence_name and $current_sequence and $sequence_name_in_bed_file{$current_sequence_name})
{
$sequence_name_to_sequence{$current_sequence_name} = $current_sequence;
}
# reads in positions described in bed file and extracts the sequences at those positions
# from the fasta
open BED_FILE, "<$bed_file" || die "Could not open $bed_file to read; terminating =(\n";
while(<BED_FILE>) # for each line in the file
{
chomp;
my $line = $_;
if($line =~ /\S/) # non-empty line
{
# retrieves sequence name and start and end
my @values = split($DELIMITER, $line);
my $sequence_name = $values[$SEQUENCE_NAME_COLUMN];
my $bed_file_subsequence_name = $values[$BED_FILE_SUBSEQUENCE_NAME_COLUMN];
my $sequence_start = $values[$START_POSITION_COLUMN]; # first position (0-indexed)
my $sequence_end = $values[$END_POSITION_COLUMN]; # non-inclusive end position (0-indexed)
my $strand = $values[$STRAND_COLUMN]; # + or -
# only retrieves sequence substring if we have read in the sequence
my $sequence_substring = "";
if($sequence_name_to_sequence{$sequence_name})
{
# retrieves described subset of sequence
my $sequence = $sequence_name_to_sequence{$sequence_name};
my $length_of_substring = $sequence_end - $sequence_start;
$sequence_substring = substr($sequence, $sequence_start, $length_of_substring);
# generates reverse complement if - strand
if($strand eq "-")
{
my $sequence_substring_rc = "";
foreach my $base(split //, $sequence_substring)
{
# retrieves complement of base
my $base_rc = "";
if($base eq "A")
{
$base_rc = "T";
}
elsif($base eq "T")
{
$base_rc = "A";
}
elsif($base eq "C")
{
$base_rc = "G";
}
elsif($base eq "G")
{
$base_rc = "C";
}
elsif($base eq "N")
{
$base_rc = "N";
}
elsif($base eq "a")
{
$base_rc = "t";
}
elsif($base eq "t")
{
$base_rc = "a";
}
elsif($base eq "c")
{
$base_rc = "g";
}
elsif($base eq "g")
{
$base_rc = "c";
}
elsif($base eq "n")
{
$base_rc = "n";
}
else
{
$base_rc = $base;
print STDERR "Error: could not retrieve reverse-complement of "
."base ".$base.".\n";
}
# adds base to start of sequence
$sequence_substring_rc = $base_rc.$sequence_substring_rc;
}
$sequence_substring = $sequence_substring_rc;
}
}
else # sequence named in bed file not found in fasta
{
print STDERR "Error: sequence not found in fasta: ".$sequence_name."\n";
}
# prints output
if($output_as_column_in_bed_file) # print as extra column in bed file
{
print $line;
print $DELIMITER;
print $sequence_substring;
print $NEWLINE;
}
else # print as fasta file
{
if($bed_file_subsequence_name)
{
print ">".$bed_file_subsequence_name;
}
else
{
print ">".$sequence_name."_".$sequence_start."-".$sequence_end;
}
print $NEWLINE;
print $sequence_substring;
print $NEWLINE;
}
}
}
close BED_FILE;
# June 9, 2022