Multiomics-Analytics-Group
diff --git a/.github/workflows/cdci.yml b/.github/workflows/cdci.yml
Lines changed: 1 addition & 3 deletions
diff --git a/environment.osx-arm64.yaml b/environment.osx-arm64.yaml
Lines changed: 1 addition & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 3 additions & 2 deletions
diff --git a/scripts/fdr_analysis.py b/scripts/fdr_analysis.py
Lines changed: 22 additions & 42 deletions
diff --git a/scripts/gridsearch.py b/scripts/gridsearch.py
Lines changed: 4 additions & 9 deletions
diff --git a/scripts/model_peptide_selector.py b/scripts/model_peptide_selector.py
Lines changed: 12 additions & 31 deletions
@@ -2,9 +2,7 @@ name: Python application
 
 on:
   push:
-    branches: ["main"]
   pull_request:
-    branches: ["main"]
 permissions:
   contents: read
 
@@ -30,4 +28,4 @@ jobs:
       - name: Lint with ruff
         run: |
           # stop the build if there are Python syntax errors or undefined names
-          ruff check src unittests
+          ruff check .
@@ -38,6 +38,7 @@ dependencies:
   - pip
   - pip:
       - build
+      - isort
       - twine
       - hatchling
       - wheel
 
@@ -67,14 +67,15 @@ where = ["src"]
 [tool.ruff]
 # Exclude notebooks to avoid linting errors on them
 exclude = ["*.ipynb", "docs/source/tutorials/*"]
-line-length = 88
+line-length = 120
 
 [tool.ruff.lint]
 # Enable common rules: E (pycodestyle errors), W (warnings), F (pyflakes), B (bugbear)
 extend-select = ["E", "W", "F", "B"]
+ignore = ["E501"]
 
 [tool.black]
-line-length = 88
+line-length = 120
 target-version = ['py311']
 
 [tool.isort]
 
@@ -20,19 +20,19 @@
 __status__ = Dev
 """
 
-import matplotlib.pyplot as plt
-import matplotlib.lines as mlines
-import pandas as pd
-import seaborn as sns
+import json
+import logging
 import os
 import sys
-import logging
-import json
 from pathlib import Path
-from Bio import SeqIO
 
+import matplotlib.lines as mlines
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+from instanexus import helpers, preprocessing, visualization
 from instanexus.assembly import Assembler
-from instanexus import visualization, helpers, preprocessing
 
 SCRIPT_DIR = Path(__file__).resolve().parent
 PROJECT_ROOT = SCRIPT_DIR.parent
@@ -95,17 +95,13 @@ def add_quantification_data(df_main, run_name, inputs_folder):
         valid_peps = set(df_main["cleaned_preds"].unique())
         df_quant = df_quant[df_quant["cleaned_preds"].isin(valid_peps)]
 
-        df_sum = df_quant.groupby("cleaned_preds", as_index=False)[
-            "total_abundance_norm"
-        ].sum()
-        df_sum.rename(
-            columns={"total_abundance_norm": "peptide_abundance"}, inplace=True
-        )
+        df_sum = df_quant.groupby("cleaned_preds", as_index=False)["total_abundance_norm"].sum()
+        df_sum.rename(columns={"total_abundance_norm": "peptide_abundance"}, inplace=True)
 
         df_merged = pd.merge(df_main, df_sum, on="cleaned_preds", how="left")
         df_merged["peptide_abundance"] = df_merged["peptide_abundance"].fillna(0)
         return df_merged
-    except:
+    except Exception:
         return df_main
 
 
@@ -121,7 +117,7 @@ def load_custom_palette():
             color = colors_data.get(json_key, {}).get("scaffold", "#333333")
             custom_palette[category_label] = color
         return custom_palette
-    except:
+    except Exception:
         return default_palette
 
 
@@ -140,9 +136,7 @@ def main():
     all_results = []
 
     for category, file_list in SAMPLE_GROUPS.items():
-        logger.info(
-            f"=== Processing Category: {category} ({len(file_list)} samples) ==="
-        )
+        logger.info(f"=== Processing Category: {category} ({len(file_list)} samples) ===")
 
         for filename in file_list:
             csv_path = INPUTS_FOLDER / filename
@@ -156,9 +150,7 @@ def main():
 
             try:
                 clean_run_name = run_name.replace("_cleaned", "")
-                meta = preprocessing.get_sample_metadata(
-                    clean_run_name, json_path=METADATA_JSON
-                )
+                meta = preprocessing.get_sample_metadata(clean_run_name, json_path=METADATA_JSON)
                 protein_norm = preprocessing.normalize_sequence(meta.get("protein", ""))
                 proteases = meta.get("proteases", [])
             except Exception as e:
@@ -169,27 +161,21 @@ def main():
 
             if "experiment_name" in df.columns:
                 df["protease"] = df["experiment_name"].apply(
-                    lambda x: preprocessing.extract_protease(x, proteases)
+                    lambda x, p=proteases: preprocessing.extract_protease(x, p)
                 )
 
             df = preprocessing.clean_dataframe(df)
 
             if "cleaned_preds" in df.columns:
-                df["cleaned_preds"] = df["cleaned_preds"].apply(
-                    preprocessing.remove_modifications
-                )
+                df["cleaned_preds"] = df["cleaned_preds"].apply(preprocessing.remove_modifications)
                 df = df.dropna(subset=["cleaned_preds"])
             else:
                 continue
 
-            df = add_quantification_data(
-                df, clean_run_name, inputs_folder=INPUTS_FOLDER
-            )
+            df = add_quantification_data(df, clean_run_name, inputs_folder=INPUTS_FOLDER)
 
             clean_list = df["cleaned_preds"].tolist()
-            filtered = preprocessing.filter_contaminants(
-                clean_list, clean_run_name, CONTAMINANTS_FASTA
-            )
+            filtered = preprocessing.filter_contaminants(clean_list, clean_run_name, CONTAMINANTS_FASTA)
             df = df[df["cleaned_preds"].isin(filtered)]
 
             for fdr in FDR_THRESHOLDS:
@@ -235,9 +221,7 @@ def main():
 
                 cov = 0
                 if mapped:
-                    df_map = visualization.create_dataframe_from_mapped_sequences(
-                        mapped
-                    )
+                    df_map = visualization.create_dataframe_from_mapped_sequences(mapped)
                     stats = helpers.compute_assembly_statistics(
                         df=df_map,
                         sequence_type="temp",
@@ -290,9 +274,7 @@ def main():
     )
 
     g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
-    g.fig.suptitle(
-        f"Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98
-    )
+    g.fig.suptitle("Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98)
 
     legend_handles = []
     for cat in SAMPLE_GROUPS.keys():
@@ -322,14 +304,12 @@ def main():
     g.set(xticks=FDR_THRESHOLDS)
 
     for ax in g.axes.flat:
-        ax.set_xticklabels([f"{int(x*100)}%" for x in FDR_THRESHOLDS])
+        ax.set_xticklabels([f"{int(x * 100)}%" for x in FDR_THRESHOLDS])
 
     g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
 
     plt.savefig(mode_output / "aggregated_coverage_faceted.svg", bbox_inches="tight")
-    plt.savefig(
-        mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight"
-    )
+    plt.savefig(mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight")
 
     logger.info(f"Aggregated plots saved to: {mode_output}")
 
 
@@ -45,9 +45,9 @@
 
 selected_grid = all_grids[method]
 
-keys, values = zip(*selected_grid.items())
+keys, values = zip(*selected_grid.items(), strict=False)
 
-combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
+combinations = [dict(zip(keys, v, strict=False)) for v in itertools.product(*values)]
 total_combinations = len(combinations)
 
 os.makedirs("logs", exist_ok=True)
@@ -60,9 +60,7 @@
     handlers=handlers,
 )
 
-logging.info(
-    f"Starting hyperparameter optimization with {total_combinations} combinations."
-)
+logging.info(f"Starting hyperparameter optimization with {total_combinations} combinations.")
 print(f"Total combinations: {total_combinations}")
 
 
@@ -83,10 +81,7 @@ def run_analysis(params, iteration):
 def grid_search_parallel():
     """Perform hyperparameter optimization in parallel."""
     with ProcessPoolExecutor(max_workers=64) as executor:
-        futures = {
-            executor.submit(run_analysis, params, idx + 1): idx + 1
-            for idx, params in enumerate(combinations)
-        }
+        futures = {executor.submit(run_analysis, params, idx + 1): idx + 1 for idx, params in enumerate(combinations)}
 
         for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
             pass
 
@@ -17,7 +17,6 @@
 __status__ = Dev
 """
 
-
 import json
 import re
 from math import log2
@@ -242,9 +241,7 @@ def load_aa_properties(json_path):
 def peptide_props(seq, aa_properties):
     """Calculate hydrophobicity, mass stats, and basic residue fraction."""
     if not seq or not isinstance(seq, str) or len(seq) == 0:
-        return pd.Series(
-            {"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0}
-        )
+        return pd.Series({"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0})
 
     vals_h = [aa_properties.get(a, {"hydro": 0})["hydro"] for a in seq]
     vals_m = [aa_properties.get(a, {"mass": 0})["mass"] for a in seq]
@@ -265,9 +262,7 @@ def build_reference_free_features(df, aa_properties, protease_rules):
 
     df = df.copy()
     df["seq_length"] = df["cleaned_preds"].str.len()
-    df["has_special"] = (
-        df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
-    )
+    df["has_special"] = df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
     df["first_aa"] = df["cleaned_preds"].str[0].astype("category").cat.codes
     df["last_aa"] = df["cleaned_preds"].str[-1].astype("category").cat.codes
 
@@ -289,40 +284,34 @@ def build_reference_free_features(df, aa_properties, protease_rules):
 
     df["cterm_matches_protease"] = [
         cterm_matches_any(s, p, protease_rules)
-        for s, p in zip(df["cleaned_preds"].fillna(""), prots_list)
+        for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
     ]
     df["nterm_matches_protease"] = [
         nterm_matches_any(s, p, protease_rules)
-        for s, p in zip(df["cleaned_preds"].fillna(""), prots_list)
+        for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
     ]
     df["internal_expected_sites_min"] = [
         internal_expected_sites_min(s, p, protease_rules)
-        for s, p in zip(df["cleaned_preds"].fillna(""), prots_list)
+        for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
     ]
 
-    df["proline_block_at_cterm"] = (
-        df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
-    )
+    df["proline_block_at_cterm"] = df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
     df["protease"] = df["protease"].astype("category").cat.codes
 
     return df
 
 
 def train_model(df, reference_seq, model_path, aa_properties, protease_rules):
     """Train Random Forest classifier and save model with optimal threshold."""
-    df["mapped"] = df["cleaned_preds"].apply(
-        lambda x: int(isinstance(x, str) and x in reference_seq)
-    )
+    df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in reference_seq))
     df = build_reference_free_features(df, aa_properties, protease_rules)
 
     exclude = ["experiment_name", "scan_number", "preds", "cleaned_preds"]
     feature_cols = [c for c in df.columns if c not in exclude and c != "mapped"]
 
     x = df[feature_cols]
     y = df["mapped"].astype(int)
-    x_train, x_test, y_train, y_test = train_test_split(
-        x, y, test_size=0.3, stratify=y, random_state=42
-    )
+    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=42)
 
     model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
     model.fit(x_train, y_train)
@@ -364,9 +353,7 @@ def plot_precision_recall(metrics, output_dir, filename="precision_recall_curve.
     best_idx = metrics["best_idx"]
     ap = metrics["ap"]
 
-    sns.lineplot(
-        x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}"
-    )
+    sns.lineplot(x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}")
     plt.scatter(
         recall[best_idx],
         precision[best_idx],
@@ -439,25 +426,19 @@ def main():
     protein_norm = prep.normalize_sequence(protein)
     df = pd.read_csv(INPUT_DIR / f"{run}.csv")
 
-    df["protease"] = df["experiment_name"].apply(
-        lambda name: prep.extract_protease(name, proteases)
-    )
+    df["protease"] = df["experiment_name"].apply(lambda name: prep.extract_protease(name, proteases))
 
     df = prep.clean_dataframe(df)
 
     df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)
 
     cleaned_psms = df["cleaned_preds"].tolist()
 
-    filtered_psms = prep.filter_contaminants(
-        cleaned_psms, run, FASTA_DIR / "contaminants.fasta"
-    )
+    filtered_psms = prep.filter_contaminants(cleaned_psms, run, FASTA_DIR / "contaminants.fasta")
 
     df = df[df["cleaned_preds"].isin(filtered_psms)]
 
-    df["mapped"] = df["cleaned_preds"].apply(
-        lambda x: int(isinstance(x, str) and x in protein_norm)
-    )
+    df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in protein_norm))
 
     model_path = BASE_DIR / "peptide_selector.pkl"
     metrics = train_model(df, protein, model_path, aa_props, protease_rules)
-Original file line number
+Diff line change
   - pip
   - pip:
       - build
+      - isort
       - twine
       - hatchling
       - wheel