Skip to content

Commit ae2043c

Browse files
Fix: resolve bare excepts and loop-variable binding; apply black formatting
1 parent aed7002 commit ae2043c

15 files changed

+142
-430
lines changed

scripts/fdr_analysis.py

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -94,17 +94,13 @@ def add_quantification_data(df_main, run_name, inputs_folder):
9494
valid_peps = set(df_main["cleaned_preds"].unique())
9595
df_quant = df_quant[df_quant["cleaned_preds"].isin(valid_peps)]
9696

97-
df_sum = df_quant.groupby("cleaned_preds", as_index=False)[
98-
"total_abundance_norm"
99-
].sum()
100-
df_sum.rename(
101-
columns={"total_abundance_norm": "peptide_abundance"}, inplace=True
102-
)
97+
df_sum = df_quant.groupby("cleaned_preds", as_index=False)["total_abundance_norm"].sum()
98+
df_sum.rename(columns={"total_abundance_norm": "peptide_abundance"}, inplace=True)
10399

104100
df_merged = pd.merge(df_main, df_sum, on="cleaned_preds", how="left")
105101
df_merged["peptide_abundance"] = df_merged["peptide_abundance"].fillna(0)
106102
return df_merged
107-
except:
103+
except Exception:
108104
return df_main
109105

110106

@@ -139,9 +135,7 @@ def main():
139135
all_results = []
140136

141137
for category, file_list in SAMPLE_GROUPS.items():
142-
logger.info(
143-
f"=== Processing Category: {category} ({len(file_list)} samples) ==="
144-
)
138+
logger.info(f"=== Processing Category: {category} ({len(file_list)} samples) ===")
145139

146140
for filename in file_list:
147141
csv_path = INPUTS_FOLDER / filename
@@ -155,9 +149,7 @@ def main():
155149

156150
try:
157151
clean_run_name = run_name.replace("_cleaned", "")
158-
meta = preprocessing.get_sample_metadata(
159-
clean_run_name, json_path=METADATA_JSON
160-
)
152+
meta = preprocessing.get_sample_metadata(clean_run_name, json_path=METADATA_JSON)
161153
protein_norm = preprocessing.normalize_sequence(meta.get("protein", ""))
162154
proteases = meta.get("proteases", [])
163155
except Exception as e:
@@ -168,27 +160,21 @@ def main():
168160

169161
if "experiment_name" in df.columns:
170162
df["protease"] = df["experiment_name"].apply(
171-
lambda x: preprocessing.extract_protease(x, proteases)
163+
lambda x, p=proteases: preprocessing.extract_protease(x, p)
172164
)
173165

174166
df = preprocessing.clean_dataframe(df)
175167

176168
if "cleaned_preds" in df.columns:
177-
df["cleaned_preds"] = df["cleaned_preds"].apply(
178-
preprocessing.remove_modifications
179-
)
169+
df["cleaned_preds"] = df["cleaned_preds"].apply(preprocessing.remove_modifications)
180170
df = df.dropna(subset=["cleaned_preds"])
181171
else:
182172
continue
183173

184-
df = add_quantification_data(
185-
df, clean_run_name, inputs_folder=INPUTS_FOLDER
186-
)
174+
df = add_quantification_data(df, clean_run_name, inputs_folder=INPUTS_FOLDER)
187175

188176
clean_list = df["cleaned_preds"].tolist()
189-
filtered = preprocessing.filter_contaminants(
190-
clean_list, clean_run_name, CONTAMINANTS_FASTA
191-
)
177+
filtered = preprocessing.filter_contaminants(clean_list, clean_run_name, CONTAMINANTS_FASTA)
192178
df = df[df["cleaned_preds"].isin(filtered)]
193179

194180
for fdr in FDR_THRESHOLDS:
@@ -234,9 +220,7 @@ def main():
234220

235221
cov = 0
236222
if mapped:
237-
df_map = visualization.create_dataframe_from_mapped_sequences(
238-
mapped
239-
)
223+
df_map = visualization.create_dataframe_from_mapped_sequences(mapped)
240224
stats = helpers.compute_assembly_statistics(
241225
df=df_map,
242226
sequence_type="temp",
@@ -289,9 +273,7 @@ def main():
289273
)
290274

291275
g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
292-
g.fig.suptitle(
293-
"Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98
294-
)
276+
g.fig.suptitle("Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98)
295277

296278
legend_handles = []
297279
for cat in SAMPLE_GROUPS.keys():
@@ -326,9 +308,7 @@ def main():
326308
g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
327309

328310
plt.savefig(mode_output / "aggregated_coverage_faceted.svg", bbox_inches="tight")
329-
plt.savefig(
330-
mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight"
331-
)
311+
plt.savefig(mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight")
332312

333313
logger.info(f"Aggregated plots saved to: {mode_output}")
334314

scripts/gridsearch.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,7 @@
6060
handlers=handlers,
6161
)
6262

63-
logging.info(
64-
f"Starting hyperparameter optimization with {total_combinations} combinations."
65-
)
63+
logging.info(f"Starting hyperparameter optimization with {total_combinations} combinations.")
6664
print(f"Total combinations: {total_combinations}")
6765

6866

@@ -83,10 +81,7 @@ def run_analysis(params, iteration):
8381
def grid_search_parallel():
8482
"""Perform hyperparameter optimization in parallel."""
8583
with ProcessPoolExecutor(max_workers=64) as executor:
86-
futures = {
87-
executor.submit(run_analysis, params, idx + 1): idx + 1
88-
for idx, params in enumerate(combinations)
89-
}
84+
futures = {executor.submit(run_analysis, params, idx + 1): idx + 1 for idx, params in enumerate(combinations)}
9085

9186
for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
9287
pass

scripts/model_peptide_selector.py

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,7 @@ def load_aa_properties(json_path):
241241
def peptide_props(seq, aa_properties):
242242
"""Calculate hydrophobicity, mass stats, and basic residue fraction."""
243243
if not seq or not isinstance(seq, str) or len(seq) == 0:
244-
return pd.Series(
245-
{"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0}
246-
)
244+
return pd.Series({"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0})
247245

248246
vals_h = [aa_properties.get(a, {"hydro": 0})["hydro"] for a in seq]
249247
vals_m = [aa_properties.get(a, {"mass": 0})["mass"] for a in seq]
@@ -264,9 +262,7 @@ def build_reference_free_features(df, aa_properties, protease_rules):
264262

265263
df = df.copy()
266264
df["seq_length"] = df["cleaned_preds"].str.len()
267-
df["has_special"] = (
268-
df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
269-
)
265+
df["has_special"] = df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
270266
df["first_aa"] = df["cleaned_preds"].str[0].astype("category").cat.codes
271267
df["last_aa"] = df["cleaned_preds"].str[-1].astype("category").cat.codes
272268

@@ -299,29 +295,23 @@ def build_reference_free_features(df, aa_properties, protease_rules):
299295
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
300296
]
301297

302-
df["proline_block_at_cterm"] = (
303-
df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
304-
)
298+
df["proline_block_at_cterm"] = df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
305299
df["protease"] = df["protease"].astype("category").cat.codes
306300

307301
return df
308302

309303

310304
def train_model(df, reference_seq, model_path, aa_properties, protease_rules):
311305
"""Train Random Forest classifier and save model with optimal threshold."""
312-
df["mapped"] = df["cleaned_preds"].apply(
313-
lambda x: int(isinstance(x, str) and x in reference_seq)
314-
)
306+
df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in reference_seq))
315307
df = build_reference_free_features(df, aa_properties, protease_rules)
316308

317309
exclude = ["experiment_name", "scan_number", "preds", "cleaned_preds"]
318310
feature_cols = [c for c in df.columns if c not in exclude and c != "mapped"]
319311

320312
x = df[feature_cols]
321313
y = df["mapped"].astype(int)
322-
x_train, x_test, y_train, y_test = train_test_split(
323-
x, y, test_size=0.3, stratify=y, random_state=42
324-
)
314+
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=42)
325315

326316
model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
327317
model.fit(x_train, y_train)
@@ -363,9 +353,7 @@ def plot_precision_recall(metrics, output_dir, filename="precision_recall_curve.
363353
best_idx = metrics["best_idx"]
364354
ap = metrics["ap"]
365355

366-
sns.lineplot(
367-
x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}"
368-
)
356+
sns.lineplot(x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}")
369357
plt.scatter(
370358
recall[best_idx],
371359
precision[best_idx],
@@ -438,25 +426,19 @@ def main():
438426
protein_norm = prep.normalize_sequence(protein)
439427
df = pd.read_csv(INPUT_DIR / f"{run}.csv")
440428

441-
df["protease"] = df["experiment_name"].apply(
442-
lambda name: prep.extract_protease(name, proteases)
443-
)
429+
df["protease"] = df["experiment_name"].apply(lambda name: prep.extract_protease(name, proteases))
444430

445431
df = prep.clean_dataframe(df)
446432

447433
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)
448434

449435
cleaned_psms = df["cleaned_preds"].tolist()
450436

451-
filtered_psms = prep.filter_contaminants(
452-
cleaned_psms, run, FASTA_DIR / "contaminants.fasta"
453-
)
437+
filtered_psms = prep.filter_contaminants(cleaned_psms, run, FASTA_DIR / "contaminants.fasta")
454438

455439
df = df[df["cleaned_preds"].isin(filtered_psms)]
456440

457-
df["mapped"] = df["cleaned_preds"].apply(
458-
lambda x: int(isinstance(x, str) and x in protein_norm)
459-
)
441+
df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in protein_norm))
460442

461443
model_path = BASE_DIR / "peptide_selector.pkl"
462444
metrics = train_model(df, protein, model_path, aa_props, protease_rules)

scripts/opt_dbg.py

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@ def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.jso
5656
raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")
5757

5858

59-
def run_pipeline_dbg(
60-
conf, kmer_size, min_overlap, max_mismatches, min_identity, size_threshold
61-
):
59+
def run_pipeline_dbg(conf, kmer_size, min_overlap, max_mismatches, min_identity, size_threshold):
6260
ass_method = "dbg"
6361
run = "ma1"
6462

@@ -97,25 +95,19 @@ def run_pipeline_dbg(
9795

9896
df = pd.read_csv(INPUT_DIR / f"{run}.csv")
9997

100-
df["protease"] = df["experiment_name"].apply(
101-
lambda name: prep.extract_protease(name, proteases)
102-
)
98+
df["protease"] = df["experiment_name"].apply(lambda name: prep.extract_protease(name, proteases))
10399

104100
df = prep.clean_dataframe(df)
105101

106102
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)
107103

108104
cleaned_psms = df["cleaned_preds"].tolist()
109105

110-
filtered_psms = prep.filter_contaminants(
111-
cleaned_psms, run, FASTA_DIR / "contaminants.fasta"
112-
)
106+
filtered_psms = prep.filter_contaminants(cleaned_psms, run, FASTA_DIR / "contaminants.fasta")
113107

114108
df = df[df["cleaned_preds"].isin(filtered_psms)]
115109

116-
df["mapped"] = df["cleaned_preds"].apply(
117-
lambda x: "True" if x in protein_norm else "False"
118-
)
110+
df["mapped"] = df["cleaned_preds"].apply(lambda x: "True" if x in protein_norm else "False")
119111

120112
df = df[df["conf"] > conf]
121113

@@ -153,9 +145,7 @@ def run_pipeline_dbg(
153145
"fasta",
154146
)
155147

156-
mapped_contigs = map.process_protein_contigs_scaffold(
157-
assembled_contigs, protein_norm, max_mismatches, min_identity
158-
)
148+
mapped_contigs = map.process_protein_contigs_scaffold(assembled_contigs, protein_norm, max_mismatches, min_identity)
159149

160150
df_contigs = map.create_dataframe_from_mapped_sequences(data=mapped_contigs)
161151

@@ -173,25 +163,19 @@ def run_pipeline_dbg(
173163

174164
assembled_scaffolds = sorted(assembled_scaffolds, key=len, reverse=True)
175165

176-
assembled_scaffolds = [
177-
scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold
178-
]
166+
assembled_scaffolds = [scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold]
179167

180168
assembled_scaffolds = dbg.merge_sequences(assembled_scaffolds)
181169

182170
assembled_scaffolds = list(set(assembled_scaffolds))
183171

184172
assembled_scaffolds = sorted(assembled_scaffolds, key=len, reverse=True)
185173

186-
assembled_scaffolds = [
187-
scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold
188-
]
174+
assembled_scaffolds = [scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold]
189175

190176
records = []
191177
for i, seq in enumerate(assembled_scaffolds):
192-
record = Bio.SeqRecord.SeqRecord(
193-
Bio.Seq.Seq(seq), id=f"scaffold_{i + 1}", description=f"length: {len(seq)}"
194-
)
178+
record = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(seq), id=f"scaffold_{i + 1}", description=f"length: {len(seq)}")
195179
records.append(record)
196180

197181
Bio.SeqIO.write(
@@ -207,9 +191,7 @@ def run_pipeline_dbg(
207191
min_identity=min_identity,
208192
)
209193

210-
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(
211-
data=mapped_scaffolds
212-
)
194+
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(data=mapped_scaffolds)
213195

214196
comp_stat.compute_assembly_statistics(
215197
df=df_scaffolds_mapped,

0 commit comments

Comments (0)