-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_fasta_one_sequence_per_file.pl
158 lines (121 loc) · 3.94 KB
/
split_fasta_one_sequence_per_file.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env perl
# Splits fasta file into multiple files with one sequence per file. Each output file is
# named using the sequence name.
# Usage:
# perl split_fasta_one_sequence_per_file.pl [fasta file path]
# New files are created at filepath of old file with "_[sequence_name].fasta" appended to
# to the end. Files already at those paths will be overwritten.
use strict;
use warnings;
my $fasta_file = $ARGV[0];
my $OVERWRITE = 1; # set to 0 to prevent overwriting (stop script rather than overwrite)
# verifies that input fasta file exists and is not empty
if(!$fasta_file)
{
print STDERR "Error: no input fasta file provided. Exiting.\n";
die;
}
if(!-e $fasta_file)
{
print STDERR "Error: input fasta file does not exist:\n\t".$fasta_file."\nExiting.\n";
die;
}
if(-z $fasta_file)
{
print STDERR "Error: input fasta file is empty:\n\t".$fasta_file."\nExiting.\n";
die;
}
# reads in start of input fasta file to verify that we have enough sequences
my $number_sequences = 0;
open FASTA_FILE, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
while(<FASTA_FILE>) # for each line in the file
{
if($_ =~ /^>/) # header line
{
$number_sequences++;
# to avoid reading large files twice, stops reading once we have verified that
# we have two sequences
if($number_sequences >= 2)
{
close FASTA_FILE;
last;
}
}
}
close FASTA_FILE;
if($number_sequences < 2)
{
print STDERR "Fewer than 2 sequences in input file. My services are not needed here.\n";
die;
}
# splits sequences in fasta file into a number of smaller files with one sequence per file
my %sequence_name_to_number_appearances = (); # key: sequence name -> value: number of times sequence name has been seen
open FASTA_FILE, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
while(<FASTA_FILE>) # for each line in the file
{
chomp;
my $line = $_;
if($line =~ /^>(.*)$/) # header line
{
# closes current output file
close OUT_FILE;
# retrieves new sequence name
my $sequence_name = $1;
# records that we have seen this sequence
$sequence_name_to_number_appearances{$sequence_name}++;
# verifies that we have not seen this new sequence name before
if($sequence_name_to_number_appearances{$sequence_name} > 1)
{
print STDERR "Warning: sequence name ".$sequence_name." appears more than once. ";
# tries to give sequence a new name
$sequence_name .= "__name_dup".($sequence_name_to_number_appearances{$sequence_name}-1);
# if new name is also taken, adds to the end of it until it isn't
while($sequence_name_to_number_appearances{$sequence_name})
{
$sequence_name .= "_name_dup";
}
# records that we have used this name
$sequence_name_to_number_appearances{$sequence_name}++;
print STDERR "Renaming to ".$sequence_name.".\n";
}
# opens new output file
my $current_output_file = $fasta_file."_".make_safe_for_filename($sequence_name).".fasta";
if(-e $current_output_file)
{
print STDERR "Warning: output file already exists. Overwriting:\n\t"
.$current_output_file."\n";
die_if_overwrite_not_allowed();
}
open OUT_FILE, ">$current_output_file" || die "Could not open $current_output_file to write; terminating =(\n";
}
print OUT_FILE $line;
print OUT_FILE "\n";
}
close FASTA_FILE;
close OUT_FILE;
# makes string safe for use as a filename
# replaces all whitespace, |s, /s, and \s with underscores
sub make_safe_for_filename
{
my $string = $_[0];
# replaces all whitespace with _s
$string =~ s/\s/_/g;
# replaces all |s with _s
$string =~ s/\|/_/g;
# replaces all /s with _s
$string =~ s/\//_/g;
# replaces all \s with _s
$string =~ s/\\/_/g;
return $string;
}
# if overwriting not allowed (if $OVERWRITE is set to 0), prints an error and exits
sub die_if_overwrite_not_allowed
{
if(!$OVERWRITE)
{
print STDERR "Error: exiting to avoid overwriting. Set \$OVERWRITE to 1 to allow "
."overwriting.\n";
die;
}
}
# July 12, 2021