11import csv
22import io
33import logging
4+ from typing import TypeAlias
45
56from tola .assembly .assembly import Assembly , AssemblyDict
67
78log = logging .getLogger (__name__ )
89
910
11+ RankedNameLengths : TypeAlias = dict [int , dict [str , int ]]
12+
13+
1014class AssemblyStatsError (Exception ):
1115 """Error from AssemblyStats"""
1216
1317
1418class AssemblyStats :
15- def __init__ (self , autosome_prefix : str = "SUPER_" ) -> None :
19+ def __init__ (self , autosome_prefix : str = "SUPER_" ):
1620 self .autosome_prefix = autosome_prefix
1721 self .input_assembly : Assembly | None = None
1822 self .cuts = 0
19- self .breaks = 0
20- self .joins = 0
23+ self .breaks = None
24+ self .joins = None
25+ self .interventions_per_gbp = None
26+ self .percent_assembly_in_chromosomes = None
2127 self .per_assembly_stats = {}
2228 self .assembly_scaffold_lengths = {}
2329
24- def make_stats (self , output_assemblies : AssemblyDict ) -> None :
30+ def make_stats (self , output_assemblies : AssemblyDict ):
2531 if not self .input_assembly :
2632 msg = "Missing input_assembly attribute"
2733 raise AssemblyStatsError (msg )
34+ self .__build_junction_stats (output_assemblies )
35+ self .__build_length_stats (output_assemblies )
36+
37+ def __build_junction_stats (self , output_assemblies : AssemblyDict ):
2838
2939 # These stats are going to be wrong if re-curating assemblies with
3040 # Fragment names beginning with "SUPER_".
@@ -63,6 +73,25 @@ def make_stats(self, output_assemblies: AssemblyDict) -> None:
6373 "manual_joins" : len ((junc_set - input_asm_set ) & total_joins ),
6474 }
6575
76+ def __build_length_stats (self , output_assemblies : AssemblyDict ):
77+ input_asm_length = self .input_assembly .fragments_length
78+
79+ # Calculate the number of breaks and joins made per Gbp of the input assembly
80+ self .interventions_per_gbp = round (
81+ (self .breaks + self .joins ) / (input_asm_length / 1e9 ), 3
82+ )
83+
84+ # Calculate the percentage of the assembly that was placed in chromosomes
85+ tier_1_and_2_length = 0
86+ for hap , asm in output_assemblies .items ():
87+ ranked_scffld_lengths = self .get_assembly_scaffold_lengths (hap , asm )
88+ for rank in (1 , 2 ):
89+ if scffld_lengths := ranked_scffld_lengths .get (rank ):
90+ tier_1_and_2_length += sum (scffld_lengths .values ())
91+ self .percent_assembly_in_chromosomes = round (
92+ 100 * (tier_1_and_2_length / input_asm_length ), 1
93+ )
94+
6695 def log_curation_stats (self ):
6796 cut_plural = "cut in a contig" if self .cuts == 1 else "cuts in contigs"
6897 break_plural = "break at a gap" if self .breaks == 1 else "breaks at gaps"
@@ -71,6 +100,10 @@ def log_curation_stats(self):
71100 f"Curation made { self .cuts } { cut_plural } , { self .breaks } "
72101 f" { break_plural } and { self .joins } { join_plural } "
73102 )
103+ log .info (
104+ f"Assembly placed in chromosomes = { self .percent_assembly_in_chromosomes } %"
105+ )
106+ log .info (f"Interventions per Gbp = { self .interventions_per_gbp } " )
74107
75108 def ranked_scaffolds (self , asm : Assembly ):
76109 ranked_scaffolds = {}
@@ -79,7 +112,7 @@ def ranked_scaffolds(self, asm: Assembly):
79112 ranked_scaffolds .setdefault (rank , []).append (scffld )
80113 return ranked_scaffolds
81114
82- def build_assembly_scaffold_lengths (self , asm : Assembly ):
115+ def build_assembly_scaffold_lengths (self , asm : Assembly ) -> RankedNameLengths :
83116 ranked_scaffolds = self .ranked_scaffolds (asm )
84117
85118 ranked_names_lengths = {}
@@ -105,7 +138,11 @@ def build_assembly_scaffold_lengths(self, asm: Assembly):
105138
106139 return ranked_names_lengths
107140
108- def get_assembly_scaffold_lengths (self , asm_key : str | None , asm : Assembly ):
141+ def get_assembly_scaffold_lengths (
142+ self ,
143+ asm_key : str | None ,
144+ asm : Assembly ,
145+ ) -> RankedNameLengths :
109146 scaff_lengths = self .assembly_scaffold_lengths .get (asm_key )
110147 if not scaff_lengths :
111148 scaff_lengths = self .assembly_scaffold_lengths [asm_key ] = (
@@ -250,7 +287,7 @@ def log_sanity_checks(self, hap_asm: dict[str | None, Assembly]) -> None:
250287 log .warning (msg )
251288
252289 def check_consistent_autosome_count (
253- self , hap_asm : dict [ str | None , Assembly ]
290+ self , hap_asm : AssemblyDict
254291 ) -> list [str ] | None :
255292 chr_counts = {}
256293 for hap , asm in hap_asm .items ():
@@ -267,9 +304,7 @@ def check_consistent_autosome_count(
267304 ]
268305 return None
269306
270- def check_for_large_haplotigs (
271- self , hap_asm : dict [str | None , Assembly ]
272- ) -> list [str ] | None :
307+ def check_for_large_haplotigs (self , hap_asm : AssemblyDict ) -> list [str ] | None :
273308 htigs = hap_asm .get ("Haplotig" )
274309 if not htigs :
275310 return None
0 commit comments