@@ -884,6 +884,8 @@ def __init__(
884884 alpha_len : float = 1.0 ,
885885 alpha_cov : float = 1.0 ,
886886 alpha_min : float = 0.2 ,
887+ reference_protein : str = None ,
888+ stats_output_folder : str = None ,
887889 ):
888890 if mode not in ["greedy" , "dbg" , "dbg_weighted" , "dbgX" , "fusion" , "multimodal_dbg" , "hybrid_dbg" ]:
889891 raise ValueError (
@@ -902,12 +904,37 @@ def __init__(
902904 self .alpha_len = alpha_len
903905 self .alpha_cov = alpha_cov
904906 self .alpha_min = alpha_min
907+ self .reference_protein = reference_protein
908+ self .stats_output_folder = stats_output_folder
909+
910+ def _compute_intermediate_stats (self , contigs , label ):
911+ """Internal wrapper for statistics."""
912+ if self .reference_protein and self .stats_output_folder :
913+ logger .info (f"Computing intermediate statistics for { label } ..." )
914+ try :
915+ mapped = viz .process_protein_contigs_scaffold (
916+ assembled_contigs = contigs ,
917+ target_protein = self .reference_protein ,
918+ max_mismatches = self .max_mismatches ,
919+ min_identity = self .min_identity
920+ )
921+ df_mapped = viz .create_dataframe_from_mapped_sequences (data = mapped )
922+ if not df_mapped .empty :
923+ helpers .compute_assembly_statistics (
924+ df = df_mapped ,
925+ sequence_type = label ,
926+ output_folder = self .stats_output_folder ,
927+ reference = self .reference_protein ,
928+ )
929+ except Exception as e :
930+ logger .warning (f"Could not compute intermediate stats: { e } " )
905931
906932 def assemble_greedy (self , sequences ):
907933 logger .info (f"[Assembler] Running Greedy assembly (min_overlap={ self .min_overlap } )" )
908934 contigs = assemble_contigs_greedy (sequences , self .min_overlap )
909935 contigs = list (set (contigs ))
910936 contigs = sorted (contigs , key = len , reverse = True )
937+ self ._compute_intermediate_stats (contigs , label = "contig" )
911938
912939 scaffolds = scaffold_iterative_greedy (contigs , self .min_overlap , self .size_threshold )
913940
@@ -949,6 +976,8 @@ def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]:
949976
950977 logger .info (f"DBG produced { len (contigs )} initial contigs." )
951978
979+ self ._compute_intermediate_stats (contigs , label = "contig" )
980+
952981 # 2. OVERLAP GRAPH REFINEMENT (The new logic)
953982 # Only runs if refine_rounds is > 0
954983 if self .refine_rounds > 0 :
@@ -1265,21 +1294,29 @@ def main(
12651294 refine_rounds : int = 0 ,
12661295):
12671296 """Main function for standalone assembly."""
1297+ output_path = Path (output_scaffolds_path )
1298+ output_path .parent .mkdir (parents = True , exist_ok = True )
1299+ stats_folder = output_path .parent / "statistics"
12681300
12691301 protein_norm = None # None means no reference mode
1302+
12701303 if reference :
12711304 logger .info ("Reference mode enabled. Loading reference protein..." )
12721305 if not metadata_json_path :
1273- raise ValueError ("metadata_json_path is required when reference mode is enabled." )
1274-
1306+ raise ValueError ("metadata_json_path is required." )
12751307 try :
1276- run_stem = Path (input_csv_path ).stem # extract run name from input file
1277- run_name = run_stem .replace ("_cleaned" , "" )
1278-
1308+ path_obj = Path (input_csv_path )
1309+
1310+ if path_obj .name == "cleaned.csv" :
1311+ run_name = path_obj .parent .parent .name
1312+ else :
1313+ run_name = path_obj .stem .replace ("_cleaned" , "" )
1314+
12791315 meta = helpers .get_sample_metadata (run = run_name , chain = chain , json_path = metadata_json_path )
12801316 protein = meta ["protein" ]
12811317 protein_norm = preprocessing .normalize_sequence (protein )
12821318 logger .info ("Reference protein loaded and normalized successfully." )
1319+ stats_folder .mkdir (parents = True , exist_ok = True )
12831320
12841321 except Exception as e :
12851322 logger .error (f"Failed to get reference protein: { e } " )
@@ -1306,6 +1343,8 @@ def main(
13061343 min_identity = min_identity ,
13071344 max_mismatches = max_mismatches ,
13081345 refine_rounds = refine_rounds ,
1346+ reference_protein = protein_norm ,
1347+ stats_output_folder = str (stats_folder ) if protein_norm else None
13091348 )
13101349
13111350 scaffolds = assembler .run (sequences = sequences , df_full = df )
0 commit comments