-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmergeSNPcov.pl
executable file
·106 lines (96 loc) · 2.64 KB
/
mergeSNPcov.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/perl -w
#
# mergeSNPcov.pl
#
# merge SNP file and coverage file information
# Replace Xs in SNP file with Ref base in appropriate positions
# Remove SNPs with fewer than 3 individuals present
#
# September 25, 2013
# Liz Cooper
use strict;
# Get the SNP file, the coverage file, and the output file as command line arguments
# Get the number of individuals as an input argument
my ($USAGE) = "\n$0 <input.genotypes> <input.cov> <output.snps> <num_individuals>
\tinput.genotypes = The sorted genotypes file from filterSNPs.pl
\tinput.cov = The merged and sorted coverage file from cleanup_cov.pl
\toutput.snps = The name of the final output file with SNPs for all individuals
\tnum_individuals = The total number of individuals that will be in the output\n\n";
unless (@ARGV) {
print $USAGE;
exit;
}
my ($snpfile, $covfile, $outfile, $num_indivs) = @ARGV;
open (SNP, $snpfile) || die "\nUnable to open the file $snpfile!\n";
open (COV, $covfile) || die "\nUnable to open the file $covfile!\n";
open (OUT, ">$outfile") || die "\nUnable to open the file $outfile!\n";
while (<SNP>) {
chomp $_;
my ($posID, $ref, $alt, $genstring) = split(/\s{1,}/, $_);
#print $posID, "\n";
my @gens = split(/,/, $genstring);
if ((scalar @gens) != $num_indivs) {
next;
} else {
my $flag = 1;
my $line = '';
LOOP: while ($flag) {
unless (eof(COV)) {
$line = <COV>;
chomp $line;
my @temp = split (/\s{1,}/, $line);
if ($temp[0] eq $posID) {
$flag = 0;
last LOOP;
} else {
$flag++;
}
}
}
my @cov_line = split(/\s{1,}/, $line);
my @pops = @cov_line[1..(scalar @cov_line - 1)];
my @lengths = ();
foreach my $p (@pops) {
my @temp_p = split(/,/,$p);
push (@lengths, (scalar @temp_p));
}
my $long_string = join(",", @pops);
my @covs = split(/,/, $long_string);
for (my $i = 0; $i < scalar @gens; $i++) {
my $g = $gens[$i];
if ($g =~ /X/) {
my $cov = $covs[$i];
if ($cov > 0) {
if (length $ref > 1) {
$gens[$i] = '0';
} else {
$gens[$i] = $ref;
}
} else {
$gens[$i] = 'N';
}
}
}
my $new_string = join('', @gens);
#print $posID, "\t", $new_string, "\n";
my $temp_string = $new_string;
$temp_string =~ s/N//g;
if (length $temp_string < 5) {
next;
} else {
my @new_pops = ();
my $prev = 0;
foreach my $len (@lengths) {
my $new_pop = substr($new_string, $prev, $len);
push (@new_pops, $new_pop);
$prev += $len;
}
my $new_out_line = join("\t", @new_pops);
print OUT $posID, "\t", $ref, "\t", $alt, "\t", $new_out_line, "\n";
}
}
}
close(SNP);
close(COV);
close(OUT);
exit;