Skip to content

Commit cc47edb

Browse files
committed
Interventions per Gbp and percent assembly in chromosomes stats
1 parent 8a7cab7 commit cc47edb

26 files changed

+96
-11
lines changed

src/tola/assembly/assembly_stats.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,40 @@
11
import csv
22
import io
33
import logging
4+
from typing import TypeAlias
45

56
from tola.assembly.assembly import Assembly, AssemblyDict
67

78
log = logging.getLogger(__name__)
89

910

11+
RankedNameLengths: TypeAlias = dict[int, dict[str, int]]
12+
13+
1014
class AssemblyStatsError(Exception):
1115
"""Error from AssemblyStats"""
1216

1317

1418
class AssemblyStats:
15-
def __init__(self, autosome_prefix: str = "SUPER_") -> None:
19+
def __init__(self, autosome_prefix: str = "SUPER_"):
1620
self.autosome_prefix = autosome_prefix
1721
self.input_assembly: Assembly | None = None
1822
self.cuts = 0
19-
self.breaks = 0
20-
self.joins = 0
23+
self.breaks = None
24+
self.joins = None
25+
self.interventions_per_gbp = None
26+
self.percent_assembly_in_chromosomes = None
2127
self.per_assembly_stats = {}
2228
self.assembly_scaffold_lengths = {}
2329

24-
def make_stats(self, output_assemblies: AssemblyDict) -> None:
30+
def make_stats(self, output_assemblies: AssemblyDict):
2531
if not self.input_assembly:
2632
msg = "Missing input_assembly attribute"
2733
raise AssemblyStatsError(msg)
34+
self.__build_junction_stats(output_assemblies)
35+
self.__build_length_stats(output_assemblies)
36+
37+
def __build_junction_stats(self, output_assemblies: AssemblyDict):
2838

2939
# These stats are going to be wrong if re-curating assemblies with
3040
# Fragment names beginning with "SUPER_".
@@ -63,6 +73,25 @@ def make_stats(self, output_assemblies: AssemblyDict) -> None:
6373
"manual_joins": len((junc_set - input_asm_set) & total_joins),
6474
}
6575

76+
def __build_length_stats(self, output_assemblies: AssemblyDict):
    """Compute the length-normalised curation statistics.

    Sets two attributes on ``self``:

    * ``interventions_per_gbp`` -- breaks plus joins per Gbp of the input
      assembly, rounded to 3 decimal places.
    * ``percent_assembly_in_chromosomes`` -- summed length of the rank 1
      and rank 2 scaffolds of every output assembly, as a percentage of
      the input assembly length, rounded to 1 decimal place.

    A statistic is left as ``None`` when it cannot be calculated: both of
    them when the input assembly has zero length, and
    ``interventions_per_gbp`` when the break or join count has not been
    set.

    Args:
        output_assemblies: Mapping of assembly key to output assembly.
            Each pair is passed to ``get_assembly_scaffold_lengths``.
    """
    input_asm_length = self.input_assembly.fragments_length

    # An empty input assembly would make both ratios divide by zero, so
    # report the stats as unavailable instead of raising.
    if not input_asm_length:
        self.interventions_per_gbp = None
        self.percent_assembly_in_chromosomes = None
        return

    # Calculate the number of breaks and joins made per Gbp of the input
    # assembly. The counts start out as None, so only do the arithmetic
    # once both have been filled in.
    if self.breaks is None or self.joins is None:
        self.interventions_per_gbp = None
    else:
        self.interventions_per_gbp = round(
            (self.breaks + self.joins) / (input_asm_length / 1e9), 3
        )

    # Calculate the percentage of the assembly that was placed in
    # chromosomes, i.e. in scaffolds ranked 1 or 2.
    tier_1_and_2_length = 0
    for hap, asm in output_assemblies.items():
        ranked_scffld_lengths = self.get_assembly_scaffold_lengths(hap, asm)
        for rank in (1, 2):
            # A rank may be missing (or empty) for a given assembly
            if scffld_lengths := ranked_scffld_lengths.get(rank):
                tier_1_and_2_length += sum(scffld_lengths.values())
    self.percent_assembly_in_chromosomes = round(
        100 * (tier_1_and_2_length / input_asm_length), 1
    )
94+
6695
def log_curation_stats(self):
6796
cut_plural = "cut in a contig" if self.cuts == 1 else "cuts in contigs"
6897
break_plural = "break at a gap" if self.breaks == 1 else "breaks at gaps"
@@ -71,6 +100,10 @@ def log_curation_stats(self):
71100
f"Curation made {self.cuts} {cut_plural}, {self.breaks}"
72101
f" {break_plural} and {self.joins} {join_plural}"
73102
)
103+
log.info(
104+
f"Assembly placed in chromosomes = {self.percent_assembly_in_chromosomes}%"
105+
)
106+
log.info(f"Interventions per Gbp = {self.interventions_per_gbp}")
74107

75108
def ranked_scaffolds(self, asm: Assembly):
76109
ranked_scaffolds = {}
@@ -79,7 +112,7 @@ def ranked_scaffolds(self, asm: Assembly):
79112
ranked_scaffolds.setdefault(rank, []).append(scffld)
80113
return ranked_scaffolds
81114

82-
def build_assembly_scaffold_lengths(self, asm: Assembly):
115+
def build_assembly_scaffold_lengths(self, asm: Assembly) -> RankedNameLengths:
83116
ranked_scaffolds = self.ranked_scaffolds(asm)
84117

85118
ranked_names_lengths = {}
@@ -105,7 +138,11 @@ def build_assembly_scaffold_lengths(self, asm: Assembly):
105138

106139
return ranked_names_lengths
107140

108-
def get_assembly_scaffold_lengths(self, asm_key: str | None, asm: Assembly):
141+
def get_assembly_scaffold_lengths(
142+
self,
143+
asm_key: str | None,
144+
asm: Assembly,
145+
) -> RankedNameLengths:
109146
scaff_lengths = self.assembly_scaffold_lengths.get(asm_key)
110147
if not scaff_lengths:
111148
scaff_lengths = self.assembly_scaffold_lengths[asm_key] = (
@@ -250,7 +287,7 @@ def log_sanity_checks(self, hap_asm: dict[str | None, Assembly]) -> None:
250287
log.warning(msg)
251288

252289
def check_consistent_autosome_count(
253-
self, hap_asm: dict[str | None, Assembly]
290+
self, hap_asm: AssemblyDict
254291
) -> list[str] | None:
255292
chr_counts = {}
256293
for hap, asm in hap_asm.items():
@@ -267,9 +304,7 @@ def check_consistent_autosome_count(
267304
]
268305
return None
269306

270-
def check_for_large_haplotigs(
271-
self, hap_asm: dict[str | None, Assembly]
272-
) -> list[str] | None:
307+
def check_for_large_haplotigs(self, hap_asm: AssemblyDict) -> list[str] | None:
273308
htigs = hap_asm.get("Haplotig")
274309
if not htigs:
275310
return None

src/tola/assembly/scripts/pretext_to_asm.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,7 @@ def write_chr_csv_files(
481481
csv_fh.write(chr_names)
482482

483483

484-
def write_info_yaml(output_file, stats, out_assemblies: AssemblyDict, clobber):
484+
def write_info_yaml(output_file, stats: AssemblyStats, out_assemblies: AssemblyDict, clobber):
485485
asm_stats = stats.per_assembly_stats
486486
info = {"assemblies": asm_stats}
487487
if len(asm_stats) > 1:
@@ -492,6 +492,8 @@ def write_info_yaml(output_file, stats, out_assemblies: AssemblyDict, clobber):
492492
if h_asm := out_assemblies.get("Haplotig"):
493493
haplotig_count = len(h_asm.scaffolds)
494494
info["manual_haplotig_removals"] = haplotig_count
495+
info["percent_assembly_in_chromosomes"] = stats.percent_assembly_in_chromosomes
496+
info["interventions_per_gbp"] = stats.interventions_per_gbp
495497

496498
yaml_file = output_file.with_name(output_file.stem + ".info.yaml")
497499
with get_output_filehandle(yaml_file, clobber) as yaml_fh:

tests/data/bChlMac1_3/bChlMac1-pretext-to-tpf.3.info.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@ assemblies:
88
manual_breaks: 15
99
manual_joins: 29
1010
manual_haplotig_removals: 2
11+
percent_assembly_in_chromosomes: 95.6
12+
interventions_per_gbp: 18.446

tests/data/bChlMac1_3/bChlMac1-pretext-to-tpf.3.log

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,7 @@ bChlMac1-pretext-to-tpf.3.haplotigs
5050
4,748,144 bp total
5151

5252
Curation made 1 cut in a contig, 15 breaks at gaps and 29 joins
53+
Assembly placed in chromosomes = 95.6%
54+
Interventions per Gbp = 18.446
5355
Mismatch in autosome count between Hap1 = 36 and Hap2 = 35
5456
Haplotig H_1 (Scaffold_75) is 4,560,100 bp which is longer than the shortest chromosome (292,585 bp)

tests/data/csSphGirg1/csSphGirg1-pretext-to-tpf.info.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ assemblies:
33
manual_breaks: 3
44
manual_joins: 4
55
manual_haplotig_removals: 0
6+
percent_assembly_in_chromosomes: 96.6
7+
interventions_per_gbp: 15.651

tests/data/csSphGirg1/csSphGirg1-pretext-to-tpf.log

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,5 @@ csSphGirg1-pretext-to-tpf.1.contaminants
1919
15,413,452 bp total
2020

2121
Curation made 0 cuts in contigs, 3 breaks at gaps and 4 joins
22+
Assembly placed in chromosomes = 96.6%
23+
Interventions per Gbp = 15.651

tests/data/eaAstIrre1/eaAstIrre1-pretext-to-tpf.info.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ assemblies:
33
manual_breaks: 71
44
manual_joins: 97
55
manual_haplotig_removals: 44
6+
percent_assembly_in_chromosomes: 93.4
7+
interventions_per_gbp: 336.493

tests/data/eaAstIrre1/eaAstIrre1-pretext-to-tpf.log

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,5 @@ eaAstIrre1-pretext-to-tpf.1.contaminants
148148
2,742,977 bp total
149149

150150
Curation made 20 cuts in contigs, 71 breaks at gaps and 97 joins
151+
Assembly placed in chromosomes = 93.4%
152+
Interventions per Gbp = 336.493

tests/data/idDilFebr1/idDilFebr1-pretext-to-tpf.info.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ assemblies:
33
manual_breaks: 20
44
manual_joins: 31
55
manual_haplotig_removals: 7
6+
percent_assembly_in_chromosomes: 93.8
7+
interventions_per_gbp: 163.495

tests/data/idDilFebr1/idDilFebr1-pretext-to-tpf.log

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,5 @@ idDilFebr1-pretext-to-tpf.1.additional_haplotigs
2828
3,119,814 bp total
2929

3030
Curation made 0 cuts in contigs, 20 breaks at gaps and 31 joins
31+
Assembly placed in chromosomes = 93.8%
32+
Interventions per Gbp = 163.495

0 commit comments

Comments
 (0)