1+ #!/usr/bin/env python3
2+ """Quick PAF statistics for comparing filtering effects"""
3+
4+ import sys
5+ from collections import defaultdict
6+
def _split_seq_name(name):
    """Split a sequence name into its (genome, chromosome) components.

    Names containing '#' are treated as '#'-delimited: the first two fields
    (re-joined with '#') form the genome key, and the third field, when
    present, is the chromosome.  A name without '#' is used unchanged for
    both components; a name with only two fields keeps the full name as its
    chromosome.

    NOTE(review): this looks like sample#haplotype#contig style naming --
    confirm against the files this script is run on.
    """
    if '#' not in name:
        return name, name
    parts = name.split('#')
    genome = '#'.join(parts[:2])
    chrom = parts[2] if len(parts) >= 3 else name
    return genome, chrom


def get_paf_stats(filename):
    """Compute summary statistics for a PAF file.

    Each line is split on tabs; lines with fewer than 12 columns are
    skipped.  Columns used: 0 (query name), 1 (query length), 2/3 (query
    start/end), 5 (target name) and 6 (target length).

    Args:
        filename: Path of the PAF file to read.

    Returns:
        A dict with the keys:
            total_mappings:     number of records counted.
            total_bases:        sum of query spans (query_end - query_start).
            self_mappings:      records whose query and target names match.
            inter_genome:       records whose genomes differ.
            inter_chromosomal:  same genome, different chromosome.
            chr_pairs:          distinct (query, target) name pairs among
                                non-self records.
            genome_pairs:       distinct (query genome, target genome) pairs
                                among inter-genome records.
            avg_coverage:       mean, over genome pairs, of
                                100 * mapped query bases / query genome size
                                (0 when there are no pairs).  Overlapping
                                mappings are not merged, so a value may
                                exceed 100.
            coverages_above_95: number of genome pairs with coverage > 95.
            total_genome_pairs: number of genome pairs a coverage was
                                computed for.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a length or coordinate column is not an integer.
    """
    total_mappings = 0
    total_bases = 0
    inter_chromosomal = 0
    inter_genome = 0
    self_mappings = 0

    genome_pair_bases = defaultdict(int)   # (query genome, target genome) -> bases
    chr_pair_mappings = defaultdict(int)   # (query name, target name) -> records
    seq_lengths = {}                       # sequence name -> reported length

    with open(filename, 'r') as f:
        for line in f:
            # PAF is tab-separated; split on the bare tab character.
            fields = line.strip().split('\t')
            if len(fields) < 12:
                continue

            query = fields[0]
            query_len = int(fields[1])
            query_start = int(fields[2])
            query_end = int(fields[3])
            target = fields[5]
            target_len = int(fields[6])

            total_mappings += 1
            mapping_len = query_end - query_start
            total_bases += mapping_len

            q_genome, q_chr = _split_seq_name(query)
            t_genome, t_chr = _split_seq_name(target)

            # Remember every sequence seen on either side so genome sizes
            # can be summed afterwards.
            seq_lengths[query] = query_len
            seq_lengths[target] = target_len

            if query == target:
                self_mappings += 1
                continue

            if q_genome != t_genome:
                inter_genome += 1
                genome_pair_bases[(q_genome, t_genome)] += mapping_len
            elif q_chr != t_chr:
                inter_chromosomal += 1

            # Every non-self record contributes to the name-pair tally.
            chr_pair_mappings[(query, target)] += 1

    # A genome's size is the summed length of all of its sequences that
    # appeared in the file, as query or as target.
    genome_totals = defaultdict(int)
    for seq_name, size in seq_lengths.items():
        genome_totals[_split_seq_name(seq_name)[0]] += size

    # Coverage of the query genome per genome pair.  Pairs whose query
    # genome has a recorded size of 0 are skipped rather than dividing by
    # zero.
    genome_coverages = [
        100.0 * bases / genome_totals[q_genome]
        for (q_genome, _), bases in genome_pair_bases.items()
        if genome_totals[q_genome] > 0
    ]

    if genome_coverages:
        avg_coverage = sum(genome_coverages) / len(genome_coverages)
    else:
        avg_coverage = 0

    return {
        'total_mappings': total_mappings,
        'total_bases': total_bases,
        'self_mappings': self_mappings,
        'inter_chromosomal': inter_chromosomal,
        'inter_genome': inter_genome,
        'chr_pairs': len(chr_pair_mappings),
        'genome_pairs': len(genome_pair_bases),
        'avg_coverage': avg_coverage,
        'coverages_above_95': sum(1 for c in genome_coverages if c > 95),
        'total_genome_pairs': len(genome_coverages),
    }
101+
def compare_paf_files(file1, file2):
    """Print a side-by-side comparison of the statistics of two PAF files.

    Statistics for both files are obtained from get_paf_stats().  The second
    file is reported relative to the first: count sections show the signed
    difference and the percentage change (0 when the first file's count is
    0); the coverage section shows the signed difference only.
    """
    before = get_paf_stats(file1)
    after = get_paf_stats(file2)
    labelled = ((file1, before), (file2, after))

    def report_count(heading, key):
        # One count section: both values, then signed delta and relative change.
        old, new = before[key], after[key]
        delta = new - old
        if old > 0:
            rel = 100.0 * delta / old
        else:
            rel = 0
        print(heading)
        print(f" {file1:30s} : {old:,} ")
        print(f" {file2:30s} : {new:,} ")
        print(f" {'Change':30s} : {delta:+,} ({rel:+.1f} %)")

    print(f"\n Comparison: {file1} vs {file2} ")
    print("=" * 60)

    report_count("\n Mappings:", 'total_mappings')
    report_count("\n Inter-chromosomal mappings:", 'inter_chromosomal')
    report_count("\n Chromosome pairs:", 'chr_pairs')

    print("\n Average genome pair coverage:")
    for path, stats in labelled:
        print(f" {path:30s} : {stats['avg_coverage']:.1f} %")
    coverage_delta = after['avg_coverage'] - before['avg_coverage']
    print(f" {'Change':30s} : {coverage_delta:+.1f} %")

    print("\n Genome pairs with >95% coverage:")
    for path, stats in labelled:
        print(f" {path:30s} : {stats['coverages_above_95']} /{stats['total_genome_pairs']} ")
140+
if __name__ == "__main__":
    # Command-line entry point: one path prints that file's statistics,
    # two paths print a comparison, anything else prints usage and exits 1.
    paths = sys.argv[1:]

    if len(paths) not in (1, 2):
        print("Usage: python3 paf_stats.py <paf_file> [<paf_file2>]")
        print(" Single file: show statistics")
        print(" Two files: compare statistics")
        sys.exit(1)

    if len(paths) == 2:
        compare_paf_files(paths[0], paths[1])
    else:
        paf_path = paths[0]
        summary = get_paf_stats(paf_path)
        print(f"\n Statistics for {paf_path} :")
        print("=" * 40)
        print(f"Total mappings: {summary['total_mappings']:,} ")
        print(f"Total bases: {summary['total_bases']:,} ")
        print(f"Self mappings: {summary['self_mappings']:,} ")
        print(f"Inter-chromosomal: {summary['inter_chromosomal']:,} ")
        print(f"Inter-genome: {summary['inter_genome']:,} ")
        print(f"Chromosome pairs: {summary['chr_pairs']:,} ")
        print(f"Average coverage: {summary['avg_coverage']:.1f} %")
        print(f"Pairs >95% coverage: {summary['coverages_above_95']} /{summary['total_genome_pairs']} ")
0 commit comments