-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathga2path.pl
More file actions
executable file
·133 lines (119 loc) · 2.93 KB
/
ga2path.pl
File metadata and controls
executable file
·133 lines (119 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/perl -w
# Process a GraphAligner GAF file to build the most common path
# Usage: ga2path.pl input.gaf
use strict;
# Load GFA paths
my @paths; # total list of paths
my %nodes; # node list; redundant
my %in; # list of incoming nodes per node
my %out; # list of outgoing nodes per node
while (<>) {
chomp($_);
my @F=split("\t", $_);
push(@paths, $F[5]);
foreach ($F[5] =~ m/[<>]([^<>]+)/g) {
$nodes{$_}++;
}
}
loop:
my $loop = 0;
print "=== NODES ===\n";
foreach (sort keys %nodes) {
print "$_ $nodes{$_}\n";
}
print "\n";
# FIXME: Basically we can do sequence assembly on the node structures.
# Eg ,-->u15->.
# / `v
# u3-->u13-->u16-->u17-->u7
# ^ /
# `<--u14<--'
#
# vvv vvv
# So u3 u13 U15 u17 u14 u13 U16 u17 u7
# or u3 u13 U16 u17 u14 u13 U15 u17 u7
# ^^^ ^^^
#
# Look for paths u3 u13 u16, u3 u13 u15, u14 u13 u13 and u14 u13 u16.
# We disambiguate the circuit then.
#
# 1. Identify triplets (IN, NODE, OUT)
# 2. If <NODE then swap IN/OUT around so order aware
# 3. Skip if *,NODE,* is just 1 route. (Ie A->B->C chain)
# 4. Foreach IN->NODE
# Replace NODE: IN->NODE->{OUT,...} with IN->IN:NODE->{OUT}
# where {OUT} is a reduced OUT list based only those from IN.
# ie.
#
# I P I--I:A--P
# \_A_/ => (or I-I:A-Q, J-J:A-P)
# / \
# J Q J--J:A--Q
# Foreach GFA node, get the predecessor.
my %prefix = ();
foreach (@paths) {
my $last1 = "";
my $last2 = "";
# FIXME: match things in @nodes only
foreach (m/[<>][^<>]+/g) {
my ($dir,$node) = m/(.)(.*)/;
# last1 is the middle entry.
# if ($last1 ne "") {
# my $t = $_;
# my $s = $last1;
# $t=~tr/<>//d;
# $s=~tr/<>//d;
# if ($_ =~ /</) {
# my $x = $t;
# $t = $s;
# $s = $x;
# }
# $in{$t}{$s}++;
# $out{$s}{$t}++;
# }
if ($last2 ne "") {
my $prev = "";
my $next = "";
if ($last1 =~ />/) {
$prev = $last2;
$next = $_;
} else {
$prev = $_;
$prev =~ tr/<>/></;
$next = $last2;
$next =~ tr/<>/></;
}
my $p = $last1;
$p =~ tr/<>//d;
$prefix{$p}{$prev}++ if ($prev ne "");
if ($prev ne "") {print "PREV: $prev $last1 $next\n"}
}
$last2 = $last1;
$last1 = $_;
}
}
#print "=== EDGES ===\n";
#foreach (sort keys(%in)) {
# foreach my $e (sort keys(%{$in{$_}})) {
# print "IN $e $_\n";
# }
# foreach my $e (sort keys(%{$out{$_}})) {
# print "OUT $_ $e\n";
# }
#}
#print "\n";
foreach (sort keys(%prefix)) {
my @a = sort {$prefix{$_}{$a} <=> $prefix{$_}{$b}} (keys(%{$prefix{$_}}));
if (scalar(@a) == 1) {
# Collapse neighbouring nodes together and remove the old one
$nodes{"$_$a[0]"}++;
delete $nodes{$_};
$loop = 1;
}
print ":", scalar(@a),"\n";
foreach my $sub (@a) {
print "$_ $sub = $prefix{$_}{$sub}\n";
}
#print "$_ $a[0]/$prefix{$_}{$a[0]} $a[-1]/$prefix{$_}{$a[-1]}\n";
}
goto loop if ($loop);