Skip to content

Commit abcf83c

Browse files
committed
Change default to no initial mapping filter and add coverage analysis
Analysis showed that 1:1 filtering on initial mappings was too aggressive, keeping only the longest alignment per chromosome pair regardless of identity. This resulted in poor coverage of homologous chromosomes in yeast (only 10% with good coverage). With no initial filter (letting scaffolding handle selection): - 952 homologous chromosome pairs identified - 391 (41%) have ~100% coverage - Much better than 39 (10%) with 1:1 initial filter The scaffolding step with 1:1 filter handles selection better as it considers chain context rather than individual alignments. Added analyze_coverage.py script to verify chromosome pair coverage.
1 parent 6a4d959 commit abcf83c

File tree

2 files changed

+181
-1
lines changed

2 files changed

+181
-1
lines changed

analyze_coverage.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Analyze chromosome pair coverage in PAF files.
4+
For yeast genomes, we expect most homologous chromosomes to have 1:1 mappings.
5+
"""
6+
7+
import sys
8+
from collections import defaultdict
9+
10+
def parse_paf_line(line):
11+
"""Parse a PAF line and extract query, target, and lengths."""
12+
fields = line.strip().split('\t')
13+
if len(fields) < 12:
14+
return None
15+
16+
query = fields[0]
17+
query_len = int(fields[1])
18+
query_start = int(fields[2])
19+
query_end = int(fields[3])
20+
target = fields[5]
21+
target_len = int(fields[6])
22+
target_start = int(fields[7])
23+
target_end = int(fields[8])
24+
25+
return {
26+
'query': query,
27+
'target': target,
28+
'query_len': query_len,
29+
'target_len': target_len,
30+
'query_cov': (query_end - query_start) / query_len,
31+
'target_cov': (target_end - target_start) / target_len,
32+
}
33+
34+
def extract_genome_and_chr(seq_name):
35+
"""Extract genome and chromosome from PanSN format: genome#haplotype#chromosome"""
36+
parts = seq_name.split('#')
37+
if len(parts) >= 3:
38+
genome = f"{parts[0]}#{parts[1]}"
39+
chromosome = parts[2]
40+
else:
41+
genome = seq_name
42+
chromosome = seq_name
43+
return genome, chromosome
44+
45+
def analyze_coverage(paf_file):
46+
"""Analyze chromosome pair coverage in a PAF file."""
47+
48+
# Track alignments between chromosome pairs
49+
chr_pairs = defaultdict(list) # (query_chr, target_chr) -> list of alignments
50+
genome_pairs = defaultdict(set) # (query_genome, target_genome) -> set of chr pairs
51+
52+
with open(paf_file, 'r') as f:
53+
for line in f:
54+
aln = parse_paf_line(line)
55+
if not aln:
56+
continue
57+
58+
q_genome, q_chr = extract_genome_and_chr(aln['query'])
59+
t_genome, t_chr = extract_genome_and_chr(aln['target'])
60+
61+
# Skip self-mappings at genome level
62+
if q_genome == t_genome:
63+
continue
64+
65+
chr_pair = (aln['query'], aln['target'])
66+
chr_pairs[chr_pair].append(aln)
67+
68+
genome_pair = (q_genome, t_genome)
69+
genome_pairs[genome_pair].add((q_chr, t_chr))
70+
71+
# Analyze coverage for each chromosome pair
72+
print(f"\nAnalyzing {paf_file}")
73+
print("=" * 80)
74+
75+
# Count chromosome pairs by coverage level
76+
coverage_bins = defaultdict(int)
77+
chr_coverage = {}
78+
79+
for (q_chr, t_chr), alignments in chr_pairs.items():
80+
# Calculate total coverage
81+
q_genome, q_chr_name = extract_genome_and_chr(q_chr)
82+
t_genome, t_chr_name = extract_genome_and_chr(t_chr)
83+
84+
# Merge overlapping alignments to get true coverage
85+
total_q_cov = sum(a['query_cov'] for a in alignments)
86+
total_t_cov = sum(a['target_cov'] for a in alignments)
87+
avg_cov = (total_q_cov + total_t_cov) / 2
88+
89+
chr_coverage[(q_chr, t_chr)] = avg_cov
90+
91+
# Bin the coverage
92+
if avg_cov < 0.1:
93+
coverage_bins['<10%'] += 1
94+
elif avg_cov < 0.5:
95+
coverage_bins['10-50%'] += 1
96+
elif avg_cov < 0.8:
97+
coverage_bins['50-80%'] += 1
98+
elif avg_cov < 0.95:
99+
coverage_bins['80-95%'] += 1
100+
elif avg_cov <= 1.05:
101+
coverage_bins['95-105% (1:1)'] += 1
102+
else:
103+
coverage_bins['>105%'] += 1
104+
105+
# Print summary statistics
106+
print(f"\nTotal chromosome pairs with alignments: {len(chr_pairs)}")
107+
print(f"Total genome pairs: {len(genome_pairs)}")
108+
109+
print("\nChromosome pair coverage distribution:")
110+
for bin_name in ['<10%', '10-50%', '50-80%', '80-95%', '95-105% (1:1)', '>105%']:
111+
count = coverage_bins[bin_name]
112+
pct = 100 * count / max(1, len(chr_pairs))
113+
print(f" {bin_name:15s}: {count:4d} pairs ({pct:5.1f}%)")
114+
115+
# Find expected homologous pairs (same chromosome name)
116+
homologous_pairs = {}
117+
for (q_chr, t_chr), cov in chr_coverage.items():
118+
q_genome, q_chr_name = extract_genome_and_chr(q_chr)
119+
t_genome, t_chr_name = extract_genome_and_chr(t_chr)
120+
121+
if q_chr_name == t_chr_name: # Same chromosome (e.g., chrI to chrI)
122+
homologous_pairs[(q_chr, t_chr)] = cov
123+
124+
print(f"\nHomologous chromosome pairs (same chr name): {len(homologous_pairs)}")
125+
126+
# Check coverage of homologous pairs
127+
good_coverage = sum(1 for cov in homologous_pairs.values() if 0.95 <= cov <= 1.05)
128+
print(f" With ~100% coverage (0.95-1.05): {good_coverage} ({100*good_coverage/max(1,len(homologous_pairs)):.1f}%)")
129+
130+
# List problematic homologous pairs
131+
problematic = [(pair, cov) for pair, cov in homologous_pairs.items() if cov < 0.95 or cov > 1.05]
132+
if problematic:
133+
print("\n Problematic homologous pairs (coverage != ~1.0):")
134+
for (q_chr, t_chr), cov in sorted(problematic, key=lambda x: x[1])[:10]:
135+
q_genome, q_chr_name = extract_genome_and_chr(q_chr)
136+
t_genome, t_chr_name = extract_genome_and_chr(t_chr)
137+
print(f" {q_genome} {q_chr_name} -> {t_genome} {t_chr_name}: {cov:.1%} coverage")
138+
139+
# Check for missing homologous pairs
140+
all_queries = set()
141+
all_targets = set()
142+
for q_chr, t_chr in chr_pairs:
143+
all_queries.add(q_chr)
144+
all_targets.add(t_chr)
145+
146+
print(f"\nUnique query sequences: {len(all_queries)}")
147+
print(f"Unique target sequences: {len(all_targets)}")
148+
149+
# For each query chromosome, check if it has alignments to the same chromosome in other genomes
150+
missing_homologs = []
151+
for q_chr in all_queries:
152+
q_genome, q_chr_name = extract_genome_and_chr(q_chr)
153+
has_homolog = False
154+
155+
for t_chr in all_targets:
156+
t_genome, t_chr_name = extract_genome_and_chr(t_chr)
157+
158+
if q_genome != t_genome and q_chr_name == t_chr_name:
159+
if (q_chr, t_chr) in chr_pairs:
160+
has_homolog = True
161+
break
162+
163+
if not has_homolog:
164+
missing_homologs.append(q_chr)
165+
166+
if missing_homologs:
167+
print(f"\nChromosomes missing homologous alignments: {len(missing_homologs)}")
168+
for chr_name in missing_homologs[:10]:
169+
genome, chr = extract_genome_and_chr(chr_name)
170+
print(f" {genome} {chr}")
171+
172+
return chr_pairs, genome_pairs, chr_coverage
173+
174+
if __name__ == "__main__":
175+
if len(sys.argv) < 2:
176+
print("Usage: python analyze_coverage.py <paf_file> [<paf_file2> ...]")
177+
sys.exit(1)
178+
179+
for paf_file in sys.argv[1:]:
180+
analyze_coverage(paf_file)

src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ struct Args {
8282
sparsify: f64,
8383

8484
/// Mapping filter: "1:1" (best), "M:N" (top M per query, N per target; ∞/many for unbounded), "many" (no filter)
85-
#[clap(short = 'n', long = "num-mappings", default_value = "1:1")]
85+
#[clap(short = 'n', long = "num-mappings", default_value = "many")]
8686
num_mappings: String,
8787

8888
/// Scaffold filter: "1:1" (best), "M:N" (top M per query, N per target; ∞/many for unbounded)

0 commit comments

Comments
 (0)