1+ #!/usr/bin/env python3
2+ """
3+ Analyze chromosome pair coverage in PAF files.
4+ For yeast genomes, we expect most homologous chromosomes to have 1:1 mappings.
5+ """
6+
7+ import sys
8+ from collections import defaultdict
9+
10+ def parse_paf_line (line ):
11+ """Parse a PAF line and extract query, target, and lengths."""
12+ fields = line .strip ().split ('\t ' )
13+ if len (fields ) < 12 :
14+ return None
15+
16+ query = fields [0 ]
17+ query_len = int (fields [1 ])
18+ query_start = int (fields [2 ])
19+ query_end = int (fields [3 ])
20+ target = fields [5 ]
21+ target_len = int (fields [6 ])
22+ target_start = int (fields [7 ])
23+ target_end = int (fields [8 ])
24+
25+ return {
26+ 'query' : query ,
27+ 'target' : target ,
28+ 'query_len' : query_len ,
29+ 'target_len' : target_len ,
30+ 'query_cov' : (query_end - query_start ) / query_len ,
31+ 'target_cov' : (target_end - target_start ) / target_len ,
32+ }
33+
34+ def extract_genome_and_chr (seq_name ):
35+ """Extract genome and chromosome from PanSN format: genome#haplotype#chromosome"""
36+ parts = seq_name .split ('#' )
37+ if len (parts ) >= 3 :
38+ genome = f"{ parts [0 ]} #{ parts [1 ]} "
39+ chromosome = parts [2 ]
40+ else :
41+ genome = seq_name
42+ chromosome = seq_name
43+ return genome , chromosome
44+
45+ def analyze_coverage (paf_file ):
46+ """Analyze chromosome pair coverage in a PAF file."""
47+
48+ # Track alignments between chromosome pairs
49+ chr_pairs = defaultdict (list ) # (query_chr, target_chr) -> list of alignments
50+ genome_pairs = defaultdict (set ) # (query_genome, target_genome) -> set of chr pairs
51+
52+ with open (paf_file , 'r' ) as f :
53+ for line in f :
54+ aln = parse_paf_line (line )
55+ if not aln :
56+ continue
57+
58+ q_genome , q_chr = extract_genome_and_chr (aln ['query' ])
59+ t_genome , t_chr = extract_genome_and_chr (aln ['target' ])
60+
61+ # Skip self-mappings at genome level
62+ if q_genome == t_genome :
63+ continue
64+
65+ chr_pair = (aln ['query' ], aln ['target' ])
66+ chr_pairs [chr_pair ].append (aln )
67+
68+ genome_pair = (q_genome , t_genome )
69+ genome_pairs [genome_pair ].add ((q_chr , t_chr ))
70+
71+ # Analyze coverage for each chromosome pair
72+ print (f"\n Analyzing { paf_file } " )
73+ print ("=" * 80 )
74+
75+ # Count chromosome pairs by coverage level
76+ coverage_bins = defaultdict (int )
77+ chr_coverage = {}
78+
79+ for (q_chr , t_chr ), alignments in chr_pairs .items ():
80+ # Calculate total coverage
81+ q_genome , q_chr_name = extract_genome_and_chr (q_chr )
82+ t_genome , t_chr_name = extract_genome_and_chr (t_chr )
83+
84+ # Merge overlapping alignments to get true coverage
85+ total_q_cov = sum (a ['query_cov' ] for a in alignments )
86+ total_t_cov = sum (a ['target_cov' ] for a in alignments )
87+ avg_cov = (total_q_cov + total_t_cov ) / 2
88+
89+ chr_coverage [(q_chr , t_chr )] = avg_cov
90+
91+ # Bin the coverage
92+ if avg_cov < 0.1 :
93+ coverage_bins ['<10%' ] += 1
94+ elif avg_cov < 0.5 :
95+ coverage_bins ['10-50%' ] += 1
96+ elif avg_cov < 0.8 :
97+ coverage_bins ['50-80%' ] += 1
98+ elif avg_cov < 0.95 :
99+ coverage_bins ['80-95%' ] += 1
100+ elif avg_cov <= 1.05 :
101+ coverage_bins ['95-105% (1:1)' ] += 1
102+ else :
103+ coverage_bins ['>105%' ] += 1
104+
105+ # Print summary statistics
106+ print (f"\n Total chromosome pairs with alignments: { len (chr_pairs )} " )
107+ print (f"Total genome pairs: { len (genome_pairs )} " )
108+
109+ print ("\n Chromosome pair coverage distribution:" )
110+ for bin_name in ['<10%' , '10-50%' , '50-80%' , '80-95%' , '95-105% (1:1)' , '>105%' ]:
111+ count = coverage_bins [bin_name ]
112+ pct = 100 * count / max (1 , len (chr_pairs ))
113+ print (f" { bin_name :15s} : { count :4d} pairs ({ pct :5.1f} %)" )
114+
115+ # Find expected homologous pairs (same chromosome name)
116+ homologous_pairs = {}
117+ for (q_chr , t_chr ), cov in chr_coverage .items ():
118+ q_genome , q_chr_name = extract_genome_and_chr (q_chr )
119+ t_genome , t_chr_name = extract_genome_and_chr (t_chr )
120+
121+ if q_chr_name == t_chr_name : # Same chromosome (e.g., chrI to chrI)
122+ homologous_pairs [(q_chr , t_chr )] = cov
123+
124+ print (f"\n Homologous chromosome pairs (same chr name): { len (homologous_pairs )} " )
125+
126+ # Check coverage of homologous pairs
127+ good_coverage = sum (1 for cov in homologous_pairs .values () if 0.95 <= cov <= 1.05 )
128+ print (f" With ~100% coverage (0.95-1.05): { good_coverage } ({ 100 * good_coverage / max (1 ,len (homologous_pairs )):.1f} %)" )
129+
130+ # List problematic homologous pairs
131+ problematic = [(pair , cov ) for pair , cov in homologous_pairs .items () if cov < 0.95 or cov > 1.05 ]
132+ if problematic :
133+ print ("\n Problematic homologous pairs (coverage != ~1.0):" )
134+ for (q_chr , t_chr ), cov in sorted (problematic , key = lambda x : x [1 ])[:10 ]:
135+ q_genome , q_chr_name = extract_genome_and_chr (q_chr )
136+ t_genome , t_chr_name = extract_genome_and_chr (t_chr )
137+ print (f" { q_genome } { q_chr_name } -> { t_genome } { t_chr_name } : { cov :.1%} coverage" )
138+
139+ # Check for missing homologous pairs
140+ all_queries = set ()
141+ all_targets = set ()
142+ for q_chr , t_chr in chr_pairs :
143+ all_queries .add (q_chr )
144+ all_targets .add (t_chr )
145+
146+ print (f"\n Unique query sequences: { len (all_queries )} " )
147+ print (f"Unique target sequences: { len (all_targets )} " )
148+
149+ # For each query chromosome, check if it has alignments to the same chromosome in other genomes
150+ missing_homologs = []
151+ for q_chr in all_queries :
152+ q_genome , q_chr_name = extract_genome_and_chr (q_chr )
153+ has_homolog = False
154+
155+ for t_chr in all_targets :
156+ t_genome , t_chr_name = extract_genome_and_chr (t_chr )
157+
158+ if q_genome != t_genome and q_chr_name == t_chr_name :
159+ if (q_chr , t_chr ) in chr_pairs :
160+ has_homolog = True
161+ break
162+
163+ if not has_homolog :
164+ missing_homologs .append (q_chr )
165+
166+ if missing_homologs :
167+ print (f"\n Chromosomes missing homologous alignments: { len (missing_homologs )} " )
168+ for chr_name in missing_homologs [:10 ]:
169+ genome , chr = extract_genome_and_chr (chr_name )
170+ print (f" { genome } { chr } " )
171+
172+ return chr_pairs , genome_pairs , chr_coverage
173+
174+ if __name__ == "__main__" :
175+ if len (sys .argv ) < 2 :
176+ print ("Usage: python analyze_coverage.py <paf_file> [<paf_file2> ...]" )
177+ sys .exit (1 )
178+
179+ for paf_file in sys .argv [1 :]:
180+ analyze_coverage (paf_file )
0 commit comments