Skip to content

Commit bc3d6f2

Browse files
modified files for enabling statistics
1 parent 481f872 commit bc3d6f2

File tree

2 files changed

+89
-11
lines changed

2 files changed

+89
-11
lines changed

src/instanexus/assembly.py

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,8 @@ def __init__(
884884
alpha_len: float = 1.0,
885885
alpha_cov: float = 1.0,
886886
alpha_min: float = 0.2,
887+
reference_protein: str = None,
888+
stats_output_folder: str = None,
887889
):
888890
if mode not in ["greedy", "dbg", "dbg_weighted", "dbgX", "fusion", "multimodal_dbg", "hybrid_dbg"]:
889891
raise ValueError(
@@ -902,12 +904,37 @@ def __init__(
902904
self.alpha_len = alpha_len
903905
self.alpha_cov = alpha_cov
904906
self.alpha_min = alpha_min
907+
self.reference_protein = reference_protein
908+
self.stats_output_folder = stats_output_folder
909+
910+
def _compute_intermediate_stats(self, contigs, label):
911+
"""Internal wrapper for statistics."""
912+
if self.reference_protein and self.stats_output_folder:
913+
logger.info(f"Computing intermediate statistics for {label}...")
914+
try:
915+
mapped = viz.process_protein_contigs_scaffold(
916+
assembled_contigs=contigs,
917+
target_protein=self.reference_protein,
918+
max_mismatches=self.max_mismatches,
919+
min_identity=self.min_identity
920+
)
921+
df_mapped = viz.create_dataframe_from_mapped_sequences(data=mapped)
922+
if not df_mapped.empty:
923+
helpers.compute_assembly_statistics(
924+
df=df_mapped,
925+
sequence_type=label,
926+
output_folder=self.stats_output_folder,
927+
reference=self.reference_protein,
928+
)
929+
except Exception as e:
930+
logger.warning(f"Could not compute intermediate stats: {e}")
905931

906932
def assemble_greedy(self, sequences):
907933
logger.info(f"[Assembler] Running Greedy assembly (min_overlap={self.min_overlap})")
908934
contigs = assemble_contigs_greedy(sequences, self.min_overlap)
909935
contigs = list(set(contigs))
910936
contigs = sorted(contigs, key=len, reverse=True)
937+
self._compute_intermediate_stats(contigs, label="contig")
911938

912939
scaffolds = scaffold_iterative_greedy(contigs, self.min_overlap, self.size_threshold)
913940

@@ -949,6 +976,8 @@ def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]:
949976

950977
logger.info(f"DBG produced {len(contigs)} initial contigs.")
951978

979+
self._compute_intermediate_stats(contigs, label="contig")
980+
952981
# 2. OVERLAP GRAPH REFINEMENT (The new logic)
953982
# Only runs if refine_rounds is > 0
954983
if self.refine_rounds > 0:
@@ -1265,21 +1294,29 @@ def main(
12651294
refine_rounds: int = 0,
12661295
):
12671296
"""Main function for standalone assembly."""
1297+
output_path = Path(output_scaffolds_path)
1298+
output_path.parent.mkdir(parents=True, exist_ok=True)
1299+
stats_folder = output_path.parent / "statistics"
12681300

12691301
protein_norm = None # None means no reference mode
1302+
12701303
if reference:
12711304
logger.info("Reference mode enabled. Loading reference protein...")
12721305
if not metadata_json_path:
1273-
raise ValueError("metadata_json_path is required when reference mode is enabled.")
1274-
1306+
raise ValueError("metadata_json_path is required.")
12751307
try:
1276-
run_stem = Path(input_csv_path).stem # extract run name from input file
1277-
run_name = run_stem.replace("_cleaned", "")
1278-
1308+
path_obj = Path(input_csv_path)
1309+
1310+
if path_obj.name == "cleaned.csv":
1311+
run_name = path_obj.parent.parent.name
1312+
else:
1313+
run_name = path_obj.stem.replace("_cleaned", "")
1314+
12791315
meta = helpers.get_sample_metadata(run=run_name, chain=chain, json_path=metadata_json_path)
12801316
protein = meta["protein"]
12811317
protein_norm = preprocessing.normalize_sequence(protein)
12821318
logger.info("Reference protein loaded and normalized successfully.")
1319+
stats_folder.mkdir(parents=True, exist_ok=True)
12831320

12841321
except Exception as e:
12851322
logger.error(f"Failed to get reference protein: {e}")
@@ -1306,6 +1343,8 @@ def main(
13061343
min_identity=min_identity,
13071344
max_mismatches=max_mismatches,
13081345
refine_rounds=refine_rounds,
1346+
reference_protein=protein_norm,
1347+
stats_output_folder=str(stats_folder) if protein_norm else None
13091348
)
13101349

13111350
scaffolds = assembler.run(sequences=sequences, df_full=df)

src/instanexus/main.py

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@
2222

2323
import argparse
2424
import logging
25+
import pandas as pd
2526
from pathlib import Path
2627

27-
from . import alignment, assembly, clustering, consensus, preprocessing
28+
from . import alignment, assembly, clustering, consensus, preprocessing, helpers, visualization as viz
2829

2930
# Setup logging
3031
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
@@ -164,6 +165,9 @@ def run_pipeline(args):
164165
# Build the experiment folder name based on parameters
165166
folder_name_parts = [f"{args.assembly_mode}"]
166167

168+
if args.chain:
169+
folder_name_parts.append(f"{args.chain}")
170+
167171
if args.fdr is not None:
168172
folder_name_parts.append(f"fdr{args.fdr}")
169173
elif args.conf is not None:
@@ -172,11 +176,11 @@ def run_pipeline(args):
172176
if "dbg" in args.assembly_mode:
173177
folder_name_parts.append(f"ks{args.kmer_size}")
174178

175-
folder_name_parts.append(f"mo{args.min_overlap}")
176-
folder_name_parts.append(f"ts{args.size_threshold}")
179+
# folder_name_parts.append(f"mo{args.min_overlap}")
180+
# folder_name_parts.append(f"ts{args.size_threshold}")
177181

178-
if args.reference:
179-
folder_name_parts.extend([f"mi{args.min_identity}", f"mm{args.max_mismatches}"])
182+
# if args.reference:
183+
# folder_name_parts.extend([f"mi{args.min_identity}", f"mm{args.max_mismatches}"])
180184

181185
run_folder_name = "_".join(folder_name_parts)
182186
experiment_folder = base_output_folder / run_folder_name # e.g., 'outputs/bsa/greedy_c0.9_mo4_ts10'
@@ -186,14 +190,15 @@ def run_pipeline(args):
186190
scaffolds_folder = experiment_folder / "scaffolds"
187191
scaffolds_fasta_path = scaffolds_folder / "scaffolds.fasta"
188192

193+
statistics_folder = scaffolds_folder / "statistics"
194+
189195
clustering_folder = scaffolds_folder / "clustering" # Clustering output
190196
cluster_fasta_folder = clustering_folder / "cluster_fasta" # Input for alignment
191197

192198
alignment_folder = scaffolds_folder / "alignment"
193199

194200
consensus_folder = scaffolds_folder / "consensus"
195201

196-
# ID for logs (optional)
197202
run_id_str = f"[{run_name} @ {run_folder_name}]"
198203

199204
logger.info(f"Starting pipeline for run: {run_id_str}")
@@ -214,6 +219,40 @@ def run_pipeline(args):
214219
except Exception as e:
215220
logger.error(f"Preprocessing failed: {e}")
216221
return
222+
223+
if args.reference:
224+
logger.info("--- [Statistics] Computing INPUT Statistics ---")
225+
try:
226+
meta = helpers.get_sample_metadata(run=run_name, chain=args.chain, json_path=args.metadata_json_path)
227+
protein_seq = meta.get("protein", "")
228+
229+
if protein_seq:
230+
protein_norm = preprocessing.normalize_sequence(protein_seq)
231+
statistics_folder.mkdir(parents=True, exist_ok=True)
232+
233+
if cleaned_csv_path.exists():
234+
df_input = pd.read_csv(cleaned_csv_path)
235+
if "cleaned_preds" in df_input.columns:
236+
input_seqs = df_input["cleaned_preds"].dropna().unique().tolist()
237+
238+
mapped_contigs = viz.process_protein_contigs_scaffold(
239+
assembled_contigs=input_seqs,
240+
target_protein=protein_norm,
241+
max_mismatches=args.max_mismatches,
242+
min_identity=args.min_identity
243+
)
244+
df_contigs_mapped = viz.create_dataframe_from_mapped_sequences(data=mapped_contigs)
245+
246+
if not df_contigs_mapped.empty:
247+
helpers.compute_assembly_statistics(
248+
df=df_contigs_mapped,
249+
sequence_type="peptide",
250+
output_folder=str(statistics_folder),
251+
reference=protein_norm,
252+
)
253+
logger.info(f"[Statistics] Saved: peptide_stats.json")
254+
except Exception as e:
255+
logger.error(f"[Statistics] Failed to compute peptide stats: {e}")
217256

218257
try:
219258
logger.info("--- [Step 2/5] Running Assembly ---")

0 commit comments

Comments
 (0)