Skip to content

Commit 5830108

Browse files
Merge pull request #13 from Multiomics-Analytics-Group/fix/linting-cleanup
Fix/linting cleanup
2 parents f9760f9 + e6c31af commit 5830108

19 files changed

+244
-540
lines changed

.github/workflows/cdci.yml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,7 @@ name: Python application
22

33
on:
44
push:
5-
branches: ["main"]
65
pull_request:
7-
branches: ["main"]
86
permissions:
97
contents: read
108

@@ -30,4 +28,4 @@ jobs:
3028
- name: Lint with ruff
3129
run: |
3230
# stop the build if there are Python syntax errors or undefined names
33-
ruff check src unittests
31+
ruff check .

environment.osx-arm64.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ dependencies:
3838
- pip
3939
- pip:
4040
- build
41+
- isort
4142
- twine
4243
- hatchling
4344
- wheel

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,14 +67,15 @@ where = ["src"]
6767
[tool.ruff]
6868
# Exclude notebooks to avoid linting errors on them
6969
exclude = ["*.ipynb", "docs/source/tutorials/*"]
70-
line-length = 88
70+
line-length = 120
7171

7272
[tool.ruff.lint]
7373
# Enable common rules: E (pycodestyle errors), W (warnings), F (pyflakes), B (bugbear)
7474
extend-select = ["E", "W", "F", "B"]
75+
ignore = ["E501"]
7576

7677
[tool.black]
77-
line-length = 88
78+
line-length = 120
7879
target-version = ['py311']
7980

8081
[tool.isort]

scripts/fdr_analysis.py

Lines changed: 22 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,19 @@
2020
__status__ = Dev
2121
"""
2222

23-
import matplotlib.pyplot as plt
24-
import matplotlib.lines as mlines
25-
import pandas as pd
26-
import seaborn as sns
23+
import json
24+
import logging
2725
import os
2826
import sys
29-
import logging
30-
import json
3127
from pathlib import Path
32-
from Bio import SeqIO
3328

29+
import matplotlib.lines as mlines
30+
import matplotlib.pyplot as plt
31+
import pandas as pd
32+
import seaborn as sns
33+
34+
from instanexus import helpers, preprocessing, visualization
3435
from instanexus.assembly import Assembler
35-
from instanexus import visualization, helpers, preprocessing
3636

3737
SCRIPT_DIR = Path(__file__).resolve().parent
3838
PROJECT_ROOT = SCRIPT_DIR.parent
@@ -95,17 +95,13 @@ def add_quantification_data(df_main, run_name, inputs_folder):
9595
valid_peps = set(df_main["cleaned_preds"].unique())
9696
df_quant = df_quant[df_quant["cleaned_preds"].isin(valid_peps)]
9797

98-
df_sum = df_quant.groupby("cleaned_preds", as_index=False)[
99-
"total_abundance_norm"
100-
].sum()
101-
df_sum.rename(
102-
columns={"total_abundance_norm": "peptide_abundance"}, inplace=True
103-
)
98+
df_sum = df_quant.groupby("cleaned_preds", as_index=False)["total_abundance_norm"].sum()
99+
df_sum.rename(columns={"total_abundance_norm": "peptide_abundance"}, inplace=True)
104100

105101
df_merged = pd.merge(df_main, df_sum, on="cleaned_preds", how="left")
106102
df_merged["peptide_abundance"] = df_merged["peptide_abundance"].fillna(0)
107103
return df_merged
108-
except:
104+
except Exception:
109105
return df_main
110106

111107

@@ -121,7 +117,7 @@ def load_custom_palette():
121117
color = colors_data.get(json_key, {}).get("scaffold", "#333333")
122118
custom_palette[category_label] = color
123119
return custom_palette
124-
except:
120+
except Exception:
125121
return default_palette
126122

127123

@@ -140,9 +136,7 @@ def main():
140136
all_results = []
141137

142138
for category, file_list in SAMPLE_GROUPS.items():
143-
logger.info(
144-
f"=== Processing Category: {category} ({len(file_list)} samples) ==="
145-
)
139+
logger.info(f"=== Processing Category: {category} ({len(file_list)} samples) ===")
146140

147141
for filename in file_list:
148142
csv_path = INPUTS_FOLDER / filename
@@ -156,9 +150,7 @@ def main():
156150

157151
try:
158152
clean_run_name = run_name.replace("_cleaned", "")
159-
meta = preprocessing.get_sample_metadata(
160-
clean_run_name, json_path=METADATA_JSON
161-
)
153+
meta = preprocessing.get_sample_metadata(clean_run_name, json_path=METADATA_JSON)
162154
protein_norm = preprocessing.normalize_sequence(meta.get("protein", ""))
163155
proteases = meta.get("proteases", [])
164156
except Exception as e:
@@ -169,27 +161,21 @@ def main():
169161

170162
if "experiment_name" in df.columns:
171163
df["protease"] = df["experiment_name"].apply(
172-
lambda x: preprocessing.extract_protease(x, proteases)
164+
lambda x, p=proteases: preprocessing.extract_protease(x, p)
173165
)
174166

175167
df = preprocessing.clean_dataframe(df)
176168

177169
if "cleaned_preds" in df.columns:
178-
df["cleaned_preds"] = df["cleaned_preds"].apply(
179-
preprocessing.remove_modifications
180-
)
170+
df["cleaned_preds"] = df["cleaned_preds"].apply(preprocessing.remove_modifications)
181171
df = df.dropna(subset=["cleaned_preds"])
182172
else:
183173
continue
184174

185-
df = add_quantification_data(
186-
df, clean_run_name, inputs_folder=INPUTS_FOLDER
187-
)
175+
df = add_quantification_data(df, clean_run_name, inputs_folder=INPUTS_FOLDER)
188176

189177
clean_list = df["cleaned_preds"].tolist()
190-
filtered = preprocessing.filter_contaminants(
191-
clean_list, clean_run_name, CONTAMINANTS_FASTA
192-
)
178+
filtered = preprocessing.filter_contaminants(clean_list, clean_run_name, CONTAMINANTS_FASTA)
193179
df = df[df["cleaned_preds"].isin(filtered)]
194180

195181
for fdr in FDR_THRESHOLDS:
@@ -235,9 +221,7 @@ def main():
235221

236222
cov = 0
237223
if mapped:
238-
df_map = visualization.create_dataframe_from_mapped_sequences(
239-
mapped
240-
)
224+
df_map = visualization.create_dataframe_from_mapped_sequences(mapped)
241225
stats = helpers.compute_assembly_statistics(
242226
df=df_map,
243227
sequence_type="temp",
@@ -290,9 +274,7 @@ def main():
290274
)
291275

292276
g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
293-
g.fig.suptitle(
294-
f"Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98
295-
)
277+
g.fig.suptitle("Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98)
296278

297279
legend_handles = []
298280
for cat in SAMPLE_GROUPS.keys():
@@ -322,14 +304,12 @@ def main():
322304
g.set(xticks=FDR_THRESHOLDS)
323305

324306
for ax in g.axes.flat:
325-
ax.set_xticklabels([f"{int(x*100)}%" for x in FDR_THRESHOLDS])
307+
ax.set_xticklabels([f"{int(x * 100)}%" for x in FDR_THRESHOLDS])
326308

327309
g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
328310

329311
plt.savefig(mode_output / "aggregated_coverage_faceted.svg", bbox_inches="tight")
330-
plt.savefig(
331-
mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight"
332-
)
312+
plt.savefig(mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight")
333313

334314
logger.info(f"Aggregated plots saved to: {mode_output}")
335315

scripts/gridsearch.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@
4545

4646
selected_grid = all_grids[method]
4747

48-
keys, values = zip(*selected_grid.items())
48+
keys, values = zip(*selected_grid.items(), strict=False)
4949

50-
combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
50+
combinations = [dict(zip(keys, v, strict=False)) for v in itertools.product(*values)]
5151
total_combinations = len(combinations)
5252

5353
os.makedirs("logs", exist_ok=True)
@@ -60,9 +60,7 @@
6060
handlers=handlers,
6161
)
6262

63-
logging.info(
64-
f"Starting hyperparameter optimization with {total_combinations} combinations."
65-
)
63+
logging.info(f"Starting hyperparameter optimization with {total_combinations} combinations.")
6664
print(f"Total combinations: {total_combinations}")
6765

6866

@@ -83,10 +81,7 @@ def run_analysis(params, iteration):
8381
def grid_search_parallel():
8482
"""Perform hyperparameter optimization in parallel."""
8583
with ProcessPoolExecutor(max_workers=64) as executor:
86-
futures = {
87-
executor.submit(run_analysis, params, idx + 1): idx + 1
88-
for idx, params in enumerate(combinations)
89-
}
84+
futures = {executor.submit(run_analysis, params, idx + 1): idx + 1 for idx, params in enumerate(combinations)}
9085

9186
for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
9287
pass

scripts/model_peptide_selector.py

Lines changed: 12 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
__status__ = Dev
1818
"""
1919

20-
2120
import json
2221
import re
2322
from math import log2
@@ -242,9 +241,7 @@ def load_aa_properties(json_path):
242241
def peptide_props(seq, aa_properties):
243242
"""Calculate hydrophobicity, mass stats, and basic residue fraction."""
244243
if not seq or not isinstance(seq, str) or len(seq) == 0:
245-
return pd.Series(
246-
{"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0}
247-
)
244+
return pd.Series({"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0})
248245

249246
vals_h = [aa_properties.get(a, {"hydro": 0})["hydro"] for a in seq]
250247
vals_m = [aa_properties.get(a, {"mass": 0})["mass"] for a in seq]
@@ -265,9 +262,7 @@ def build_reference_free_features(df, aa_properties, protease_rules):
265262

266263
df = df.copy()
267264
df["seq_length"] = df["cleaned_preds"].str.len()
268-
df["has_special"] = (
269-
df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
270-
)
265+
df["has_special"] = df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
271266
df["first_aa"] = df["cleaned_preds"].str[0].astype("category").cat.codes
272267
df["last_aa"] = df["cleaned_preds"].str[-1].astype("category").cat.codes
273268

@@ -289,40 +284,34 @@ def build_reference_free_features(df, aa_properties, protease_rules):
289284

290285
df["cterm_matches_protease"] = [
291286
cterm_matches_any(s, p, protease_rules)
292-
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list)
287+
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
293288
]
294289
df["nterm_matches_protease"] = [
295290
nterm_matches_any(s, p, protease_rules)
296-
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list)
291+
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
297292
]
298293
df["internal_expected_sites_min"] = [
299294
internal_expected_sites_min(s, p, protease_rules)
300-
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list)
295+
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
301296
]
302297

303-
df["proline_block_at_cterm"] = (
304-
df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
305-
)
298+
df["proline_block_at_cterm"] = df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
306299
df["protease"] = df["protease"].astype("category").cat.codes
307300

308301
return df
309302

310303

311304
def train_model(df, reference_seq, model_path, aa_properties, protease_rules):
312305
"""Train Random Forest classifier and save model with optimal threshold."""
313-
df["mapped"] = df["cleaned_preds"].apply(
314-
lambda x: int(isinstance(x, str) and x in reference_seq)
315-
)
306+
df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in reference_seq))
316307
df = build_reference_free_features(df, aa_properties, protease_rules)
317308

318309
exclude = ["experiment_name", "scan_number", "preds", "cleaned_preds"]
319310
feature_cols = [c for c in df.columns if c not in exclude and c != "mapped"]
320311

321312
x = df[feature_cols]
322313
y = df["mapped"].astype(int)
323-
x_train, x_test, y_train, y_test = train_test_split(
324-
x, y, test_size=0.3, stratify=y, random_state=42
325-
)
314+
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=42)
326315

327316
model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
328317
model.fit(x_train, y_train)
@@ -364,9 +353,7 @@ def plot_precision_recall(metrics, output_dir, filename="precision_recall_curve.
364353
best_idx = metrics["best_idx"]
365354
ap = metrics["ap"]
366355

367-
sns.lineplot(
368-
x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}"
369-
)
356+
sns.lineplot(x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}")
370357
plt.scatter(
371358
recall[best_idx],
372359
precision[best_idx],
@@ -439,25 +426,19 @@ def main():
439426
protein_norm = prep.normalize_sequence(protein)
440427
df = pd.read_csv(INPUT_DIR / f"{run}.csv")
441428

442-
df["protease"] = df["experiment_name"].apply(
443-
lambda name: prep.extract_protease(name, proteases)
444-
)
429+
df["protease"] = df["experiment_name"].apply(lambda name: prep.extract_protease(name, proteases))
445430

446431
df = prep.clean_dataframe(df)
447432

448433
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)
449434

450435
cleaned_psms = df["cleaned_preds"].tolist()
451436

452-
filtered_psms = prep.filter_contaminants(
453-
cleaned_psms, run, FASTA_DIR / "contaminants.fasta"
454-
)
437+
filtered_psms = prep.filter_contaminants(cleaned_psms, run, FASTA_DIR / "contaminants.fasta")
455438

456439
df = df[df["cleaned_preds"].isin(filtered_psms)]
457440

458-
df["mapped"] = df["cleaned_preds"].apply(
459-
lambda x: int(isinstance(x, str) and x in protein_norm)
460-
)
441+
df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in protein_norm))
461442

462443
model_path = BASE_DIR / "peptide_selector.pkl"
463444
metrics = train_model(df, protein, model_path, aa_props, protease_rules)

0 commit comments

Comments
 (0)