-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgfa_edge_to_node.pl
More file actions
executable file
·155 lines (131 loc) · 4.02 KB
/
gfa_edge_to_node.pl
File metadata and controls
executable file
·155 lines (131 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/perl -w
# FAILS as an experiment. It needs full graph traversal to do this.
# Reads a GFA file with edge and node weights and applies edge transformations
# to split node weights into plus and minus direction.
#
# We read edge EC:i node SC:f, and add node sp:f and sm:f tags
use strict;
# Parse GFA
my %seq; # node sequence
my %node; # node GFA line (S)
my @edge; # edge GFA line (L)
my $edge_num=0;
my %edge_in; # index into @edge above
my %edge_out; # index into @edge above
my %inv_edge_in;
my %inv_edge_out;
local $"="\t";
while (<>) {
chomp();
if (/^S\s+(\S+)/) {
$node{$1} = $_;
my @N = split("\t", $_);
$seq{$N[1]} = $N[2];
} elsif (/^L\s+(\S+)\s+(.)\s+(\S+)\s+(.)/) {
$edge[$edge_num] = $_;
push(@{$edge_out{$1}}, $edge_num);
push(@{$edge_in{$3}}, $edge_num);
push(@{$inv_edge_out{$3}}, $edge_num);
push(@{$inv_edge_in{$1}}, $edge_num);
$edge_num++;
}
}
# Count routes into a node via edges and emit the modified GFA
# TODO: confirm that this is sensible.
# "Fixing overall parity" but only taking the versions present in the .gfa and not the inverses.
# e.g. s1 + s2 + should also imply s2 - s1 -.
# Counter example??
# Node Weight
# s0 2
# s1 2
# s2 1
# s3 1
# s4 1
# Edges
# s0 + s1 + 2
# s1 + s2 + 1
# s2 + s4 + 1
# s1 + s3 + 1
# s4 + s3 - 1
# Paths
# s0+ -> s1+ -> s2+ -> s4+ -> s3- -> s1- -> s0-
# s0+ -> s1+ -> s3+ -> s4- -> s2- -> s1- -> s0-
# In either case, we have weight on s0- & s1- which is not counted below. The orientation of s2 & s3 is ambiguous, but only s3 has ambiguous weighting.
foreach my $n (sort keys %node) {
print "$n \n";
my ($SC) = $node{$n} =~ m/SC:[if]:(\d+(\.\d+)?)/;
my %dc=qw/+ 0 - 0/;
foreach my $e (@{$edge_in{$n}}) {
print "edge in num: $e\n";
my ($d1,$d2,$EC) = $edge[$e] =~ m/.*([+-]).*([+-]).*EC:i:(\d+)/;
$dc{$d2}+=$EC;
}
foreach my $e (@{$edge_out{$n}}) {
print "edge out num: $e\n";
my ($d1,$d2,$EC) = $edge[$e] =~ m/.*([+-]).*([+-]).*EC:i:(\d+)/;
$dc{$d1}+=$EC;
}
# foreach my $e (@{$inv_edge_in{$n}}) {
# print "inv edge in num: $e\n";
# my ($d1,$d2,$EC) = $edge[$e] =~ m/.*([+-]).*([+-]).*EC:i:(\d+)/;
# if ($d1 =~ m/\+/) {
# $dc{"-"}+=$EC;
# } else {
# $dc{"+"}+=$EC;
# }
# }
# foreach my $e (@{$inv_edge_out{$n}}) {
# print "inv edge out num: $e\n";
# my ($d1,$d2,$EC) = $edge[$e] =~ m/.*([+-]).*([+-]).*EC:i:(\d+)/;
# if ($d2 =~ m/\+/) {
# $dc{"-"}+=$EC;
# } else {
# $dc{"+"}+=$EC;
# }
# }
my $dn=$dc{"+"}+$dc{"-"};
print "dn: $dn \n";
print "dc+: ", $dc{"+"}, "\n";
print "dc-: ", $dc{"-"}, "\n";
print "\n";
$dc{"+"}=($dc{"+"}+0)/($dn+1) * $SC;
$dc{"-"}=($dc{"-"}+0)/($dn+1) * $SC;
printf("%-10s %5.1f %5.1f %5.1f\n",
$n, $SC, $dc{"+"}, $dc{"-"});
# print "$node{$n}\tsp:f:",$dc{"+"},"\tsm:f:",$dc{"-"},"\n";
}
# foreach my $e (@edge) {
# print "$e","\n"
# }
__END__;
/tmp/tangle/illumina_km_00103/seq_1091-0027-#1#1.gfa
[ 21] s42+
[ 22] s18+
[ 23] s43-
[ 24] s21-
[ 25] s20-
[ 26] s19-
[ 27] s44+
[ 28] s23+
L s16 + s42 + SR:i:1 L1:i:233 L2:i:56 EC:i:41
L s17 + s18 + SR:i:0 L1:i:1 L2:i:24 EC:i:0
L s18 + s19 + SR:i:0 L1:i:24 L2:i:20 EC:i:0
L s18 + s43 - SR:i:4 L1:i:24 L2:i:12 EC:i:0
L s19 + s20 + SR:i:0 L1:i:20 L2:i:76 EC:i:16
L s19 + s49 + SR:i:3 L1:i:20 L2:i:76 EC:i:0
L s19 - s44 + SR:i:4 L1:i:20 L2:i:45 EC:i:27
L s20 + s21 + SR:i:0 L1:i:76 L2:i:120 EC:i:26
L s21 + s22 + SR:i:0 L1:i:120 L2:i:168 EC:i:0
L s21 + s43 + SR:i:1 L1:i:120 L2:i:12 EC:i:0
L s22 + s23 + SR:i:0 L1:i:168 L2:i:905 EC:i:2
L s23 + s24 + SR:i:0 L1:i:905 L2:i:492 EC:i:85
L s23 + s45 + SR:i:1 L1:i:905 L2:i:565 EC:i:0
L s42 + s18 + SR:i:1 L1:i:56 L2:i:24 EC:i:47
L s43 + s44 + SR:i:1 L1:i:12 L2:i:45 EC:i:0
L s44 + s23 + SR:i:1 L1:i:45 L2:i:905 EC:i:69
L s49 + s21 + SR:i:3 L1:i:76 L2:i:120 EC:i:0
+ + - - - - + +
So +42+ +18+ +43- +21+ +20+ +19+ -44+ +23+
SR:i rank
L1:i arc_len, length of first seq
L2:i arc_lw, length of second seq