Skip to content

Commit ae2043c

Browse files
Fix: resolve bare excepts and loop-variable binding; apply black formatting
1 parent aed7002 commit ae2043c

15 files changed

+142
-430
lines changed

scripts/fdr_analysis.py

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -94,17 +94,13 @@ def add_quantification_data(df_main, run_name, inputs_folder):
9494
valid_peps = set(df_main["cleaned_preds"].unique())
9595
df_quant = df_quant[df_quant["cleaned_preds"].isin(valid_peps)]
9696

97-
df_sum = df_quant.groupby("cleaned_preds", as_index=False)[
98-
"total_abundance_norm"
99-
].sum()
100-
df_sum.rename(
101-
columns={"total_abundance_norm": "peptide_abundance"}, inplace=True
102-
)
97+
df_sum = df_quant.groupby("cleaned_preds", as_index=False)["total_abundance_norm"].sum()
98+
df_sum.rename(columns={"total_abundance_norm": "peptide_abundance"}, inplace=True)
10399

104100
df_merged = pd.merge(df_main, df_sum, on="cleaned_preds", how="left")
105101
df_merged["peptide_abundance"] = df_merged["peptide_abundance"].fillna(0)
106102
return df_merged
107-
except:
103+
except Exception:
108104
return df_main
109105

110106

@@ -139,9 +135,7 @@ def main():
139135
all_results = []
140136

141137
for category, file_list in SAMPLE_GROUPS.items():
142-
logger.info(
143-
f"=== Processing Category: {category} ({len(file_list)} samples) ==="
144-
)
138+
logger.info(f"=== Processing Category: {category} ({len(file_list)} samples) ===")
145139

146140
for filename in file_list:
147141
csv_path = INPUTS_FOLDER / filename
@@ -155,9 +149,7 @@ def main():
155149

156150
try:
157151
clean_run_name = run_name.replace("_cleaned", "")
158-
meta = preprocessing.get_sample_metadata(
159-
clean_run_name, json_path=METADATA_JSON
160-
)
152+
meta = preprocessing.get_sample_metadata(clean_run_name, json_path=METADATA_JSON)
161153
protein_norm = preprocessing.normalize_sequence(meta.get("protein", ""))
162154
proteases = meta.get("proteases", [])
163155
except Exception as e:
@@ -168,27 +160,21 @@ def main():
168160

169161
if "experiment_name" in df.columns:
170162
df["protease"] = df["experiment_name"].apply(
171-
lambda x: preprocessing.extract_protease(x, proteases)
163+
lambda x, p=proteases: preprocessing.extract_protease(x, p)
172164
)
173165

174166
df = preprocessing.clean_dataframe(df)
175167

176168
if "cleaned_preds" in df.columns:
177-
df["cleaned_preds"] = df["cleaned_preds"].apply(
178-
preprocessing.remove_modifications
179-
)
169+
df["cleaned_preds"] = df["cleaned_preds"].apply(preprocessing.remove_modifications)
180170
df = df.dropna(subset=["cleaned_preds"])
181171
else:
182172
continue
183173

184-
df = add_quantification_data(
185-
df, clean_run_name, inputs_folder=INPUTS_FOLDER
186-
)
174+
df = add_quantification_data(df, clean_run_name, inputs_folder=INPUTS_FOLDER)
187175

188176
clean_list = df["cleaned_preds"].tolist()
189-
filtered = preprocessing.filter_contaminants(
190-
clean_list, clean_run_name, CONTAMINANTS_FASTA
191-
)
177+
filtered = preprocessing.filter_contaminants(clean_list, clean_run_name, CONTAMINANTS_FASTA)
192178
df = df[df["cleaned_preds"].isin(filtered)]
193179

194180
for fdr in FDR_THRESHOLDS:
@@ -234,9 +220,7 @@ def main():
234220

235221
cov = 0
236222
if mapped:
237-
df_map = visualization.create_dataframe_from_mapped_sequences(
238-
mapped
239-
)
223+
df_map = visualization.create_dataframe_from_mapped_sequences(mapped)
240224
stats = helpers.compute_assembly_statistics(
241225
df=df_map,
242226
sequence_type="temp",
@@ -289,9 +273,7 @@ def main():
289273
)
290274

291275
g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
292-
g.fig.suptitle(
293-
"Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98
294-
)
276+
g.fig.suptitle("Aggregated assembly performance (Mean ± 95% CI)", fontsize=16, y=0.98)
295277

296278
legend_handles = []
297279
for cat in SAMPLE_GROUPS.keys():
@@ -326,9 +308,7 @@ def main():
326308
g.fig.subplots_adjust(top=0.82, wspace=0.3, hspace=0.4)
327309

328310
plt.savefig(mode_output / "aggregated_coverage_faceted.svg", bbox_inches="tight")
329-
plt.savefig(
330-
mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight"
331-
)
311+
plt.savefig(mode_output / "aggregated_coverage_faceted.png", dpi=300, bbox_inches="tight")
332312

333313
logger.info(f"Aggregated plots saved to: {mode_output}")
334314

scripts/gridsearch.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,7 @@
6060
handlers=handlers,
6161
)
6262

63-
logging.info(
64-
f"Starting hyperparameter optimization with {total_combinations} combinations."
65-
)
63+
logging.info(f"Starting hyperparameter optimization with {total_combinations} combinations.")
6664
print(f"Total combinations: {total_combinations}")
6765

6866

@@ -83,10 +81,7 @@ def run_analysis(params, iteration):
8381
def grid_search_parallel():
8482
"""Perform hyperparameter optimization in parallel."""
8583
with ProcessPoolExecutor(max_workers=64) as executor:
86-
futures = {
87-
executor.submit(run_analysis, params, idx + 1): idx + 1
88-
for idx, params in enumerate(combinations)
89-
}
84+
futures = {executor.submit(run_analysis, params, idx + 1): idx + 1 for idx, params in enumerate(combinations)}
9085

9186
for _ in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
9287
pass

scripts/model_peptide_selector.py

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -241,9 +241,7 @@ def load_aa_properties(json_path):
241241
def peptide_props(seq, aa_properties):
242242
"""Calculate hydrophobicity, mass stats, and basic residue fraction."""
243243
if not seq or not isinstance(seq, str) or len(seq) == 0:
244-
return pd.Series(
245-
{"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0}
246-
)
244+
return pd.Series({"mean_hydro": 0, "mean_mass": 0, "mass_std": 0, "frac_basic": 0})
247245

248246
vals_h = [aa_properties.get(a, {"hydro": 0})["hydro"] for a in seq]
249247
vals_m = [aa_properties.get(a, {"mass": 0})["mass"] for a in seq]
@@ -264,9 +262,7 @@ def build_reference_free_features(df, aa_properties, protease_rules):
264262

265263
df = df.copy()
266264
df["seq_length"] = df["cleaned_preds"].str.len()
267-
df["has_special"] = (
268-
df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
269-
)
265+
df["has_special"] = df["cleaned_preds"].str.contains(r"[^A-Z]", regex=True).astype(int)
270266
df["first_aa"] = df["cleaned_preds"].str[0].astype("category").cat.codes
271267
df["last_aa"] = df["cleaned_preds"].str[-1].astype("category").cat.codes
272268

@@ -299,29 +295,23 @@ def build_reference_free_features(df, aa_properties, protease_rules):
299295
for s, p in zip(df["cleaned_preds"].fillna(""), prots_list, strict=False)
300296
]
301297

302-
df["proline_block_at_cterm"] = (
303-
df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
304-
)
298+
df["proline_block_at_cterm"] = df["cleaned_preds"].fillna("").apply(proline_block_at_cterm)
305299
df["protease"] = df["protease"].astype("category").cat.codes
306300

307301
return df
308302

309303

310304
def train_model(df, reference_seq, model_path, aa_properties, protease_rules):
311305
"""Train Random Forest classifier and save model with optimal threshold."""
312-
df["mapped"] = df["cleaned_preds"].apply(
313-
lambda x: int(isinstance(x, str) and x in reference_seq)
314-
)
306+
df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in reference_seq))
315307
df = build_reference_free_features(df, aa_properties, protease_rules)
316308

317309
exclude = ["experiment_name", "scan_number", "preds", "cleaned_preds"]
318310
feature_cols = [c for c in df.columns if c not in exclude and c != "mapped"]
319311

320312
x = df[feature_cols]
321313
y = df["mapped"].astype(int)
322-
x_train, x_test, y_train, y_test = train_test_split(
323-
x, y, test_size=0.3, stratify=y, random_state=42
324-
)
314+
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=42)
325315

326316
model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
327317
model.fit(x_train, y_train)
@@ -363,9 +353,7 @@ def plot_precision_recall(metrics, output_dir, filename="precision_recall_curve.
363353
best_idx = metrics["best_idx"]
364354
ap = metrics["ap"]
365355

366-
sns.lineplot(
367-
x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}"
368-
)
356+
sns.lineplot(x=recall, y=precision, color="#2E86AB", linewidth=1, label=f"AP = {ap:.2f}")
369357
plt.scatter(
370358
recall[best_idx],
371359
precision[best_idx],
@@ -438,25 +426,19 @@ def main():
438426
protein_norm = prep.normalize_sequence(protein)
439427
df = pd.read_csv(INPUT_DIR / f"{run}.csv")
440428

441-
df["protease"] = df["experiment_name"].apply(
442-
lambda name: prep.extract_protease(name, proteases)
443-
)
429+
df["protease"] = df["experiment_name"].apply(lambda name: prep.extract_protease(name, proteases))
444430

445431
df = prep.clean_dataframe(df)
446432

447433
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)
448434

449435
cleaned_psms = df["cleaned_preds"].tolist()
450436

451-
filtered_psms = prep.filter_contaminants(
452-
cleaned_psms, run, FASTA_DIR / "contaminants.fasta"
453-
)
437+
filtered_psms = prep.filter_contaminants(cleaned_psms, run, FASTA_DIR / "contaminants.fasta")
454438

455439
df = df[df["cleaned_preds"].isin(filtered_psms)]
456440

457-
df["mapped"] = df["cleaned_preds"].apply(
458-
lambda x: int(isinstance(x, str) and x in protein_norm)
459-
)
441+
df["mapped"] = df["cleaned_preds"].apply(lambda x: int(isinstance(x, str) and x in protein_norm))
460442

461443
model_path = BASE_DIR / "peptide_selector.pkl"
462444
metrics = train_model(df, protein, model_path, aa_props, protease_rules)

scripts/opt_dbg.py

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@ def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.jso
5656
raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")
5757

5858

59-
def run_pipeline_dbg(
60-
conf, kmer_size, min_overlap, max_mismatches, min_identity, size_threshold
61-
):
59+
def run_pipeline_dbg(conf, kmer_size, min_overlap, max_mismatches, min_identity, size_threshold):
6260
ass_method = "dbg"
6361
run = "ma1"
6462

@@ -97,25 +95,19 @@ def run_pipeline_dbg(
9795

9896
df = pd.read_csv(INPUT_DIR / f"{run}.csv")
9997

100-
df["protease"] = df["experiment_name"].apply(
101-
lambda name: prep.extract_protease(name, proteases)
102-
)
98+
df["protease"] = df["experiment_name"].apply(lambda name: prep.extract_protease(name, proteases))
10399

104100
df = prep.clean_dataframe(df)
105101

106102
df["cleaned_preds"] = df["preds"].apply(prep.remove_modifications)
107103

108104
cleaned_psms = df["cleaned_preds"].tolist()
109105

110-
filtered_psms = prep.filter_contaminants(
111-
cleaned_psms, run, FASTA_DIR / "contaminants.fasta"
112-
)
106+
filtered_psms = prep.filter_contaminants(cleaned_psms, run, FASTA_DIR / "contaminants.fasta")
113107

114108
df = df[df["cleaned_preds"].isin(filtered_psms)]
115109

116-
df["mapped"] = df["cleaned_preds"].apply(
117-
lambda x: "True" if x in protein_norm else "False"
118-
)
110+
df["mapped"] = df["cleaned_preds"].apply(lambda x: "True" if x in protein_norm else "False")
119111

120112
df = df[df["conf"] > conf]
121113

@@ -153,9 +145,7 @@ def run_pipeline_dbg(
153145
"fasta",
154146
)
155147

156-
mapped_contigs = map.process_protein_contigs_scaffold(
157-
assembled_contigs, protein_norm, max_mismatches, min_identity
158-
)
148+
mapped_contigs = map.process_protein_contigs_scaffold(assembled_contigs, protein_norm, max_mismatches, min_identity)
159149

160150
df_contigs = map.create_dataframe_from_mapped_sequences(data=mapped_contigs)
161151

@@ -173,25 +163,19 @@ def run_pipeline_dbg(
173163

174164
assembled_scaffolds = sorted(assembled_scaffolds, key=len, reverse=True)
175165

176-
assembled_scaffolds = [
177-
scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold
178-
]
166+
assembled_scaffolds = [scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold]
179167

180168
assembled_scaffolds = dbg.merge_sequences(assembled_scaffolds)
181169

182170
assembled_scaffolds = list(set(assembled_scaffolds))
183171

184172
assembled_scaffolds = sorted(assembled_scaffolds, key=len, reverse=True)
185173

186-
assembled_scaffolds = [
187-
scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold
188-
]
174+
assembled_scaffolds = [scaffold for scaffold in assembled_scaffolds if len(scaffold) > size_threshold]
189175

190176
records = []
191177
for i, seq in enumerate(assembled_scaffolds):
192-
record = Bio.SeqRecord.SeqRecord(
193-
Bio.Seq.Seq(seq), id=f"scaffold_{i + 1}", description=f"length: {len(seq)}"
194-
)
178+
record = Bio.SeqRecord.SeqRecord(Bio.Seq.Seq(seq), id=f"scaffold_{i + 1}", description=f"length: {len(seq)}")
195179
records.append(record)
196180

197181
Bio.SeqIO.write(
@@ -207,9 +191,7 @@ def run_pipeline_dbg(
207191
min_identity=min_identity,
208192
)
209193

210-
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(
211-
data=mapped_scaffolds
212-
)
194+
df_scaffolds_mapped = map.create_dataframe_from_mapped_sequences(data=mapped_scaffolds)
213195

214196
comp_stat.compute_assembly_statistics(
215197
df=df_scaffolds_mapped,

0 commit comments

Comments (0)