-
Notifications
You must be signed in to change notification settings - Fork 61
Expand file tree
/
Copy path5.0_Summarize_Known_Transcriptome.pl
More file actions
28 lines (24 loc) · 1016 Bytes
/
5.0_Summarize_Known_Transcriptome.pl
File metadata and controls
28 lines (24 loc) · 1016 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
use strict;
use warnings;
#Things to filter:
# single exon non-reference transcripts (class 'u' 'i' & single exon),
# transcripts with retained introns (class 'e'),
# polymerase read-though (class 'p'),
# class code 's' (likely read mapping error)
# helpful info: http://seqanswers.com/forums/showthread.php?t=3518
# Stats I would like to have: (1) % reference transcripts recovered (# transcripts class '=' vs # transcripts genome, (2) # novel intergenic multi-exonic transcripts, (3) # novel alternatively spliced transcripts
if (@ARGV < 1) {die "Please provide reference GTF\n";}
my %transcriptid2lines =();
my %transcriptid2numexons = ();
open (my $ifh, $ARGV[0]) or die $!;
while (<$ifh>) {
chomp;
$_ =~ /transcript_id "(.+?)";/;
my $tid = $1;
if ($_ =~ /exon_number "(\d+)"/) {
if (!exists($transcriptid2numexons{$tid}) || $transcriptid2numexons{$tid} < $1) {
$transcriptid2numexons{$tid} = $1;
}
}
} close($ifh);
print "Number of Transcripts: ".scalar(keys(%transcriptid2numexons))."\n";