1717__status__ = Dev
1818"""
1919
20-
2120import json
2221import re
2322from math import log2
@@ -242,9 +241,7 @@ def load_aa_properties(json_path):
242241def peptide_props (seq , aa_properties ):
243242 """Calculate hydrophobicity, mass stats, and basic residue fraction."""
244243 if not seq or not isinstance (seq , str ) or len (seq ) == 0 :
245- return pd .Series (
246- {"mean_hydro" : 0 , "mean_mass" : 0 , "mass_std" : 0 , "frac_basic" : 0 }
247- )
244+ return pd .Series ({"mean_hydro" : 0 , "mean_mass" : 0 , "mass_std" : 0 , "frac_basic" : 0 })
248245
249246 vals_h = [aa_properties .get (a , {"hydro" : 0 })["hydro" ] for a in seq ]
250247 vals_m = [aa_properties .get (a , {"mass" : 0 })["mass" ] for a in seq ]
@@ -265,9 +262,7 @@ def build_reference_free_features(df, aa_properties, protease_rules):
265262
266263 df = df .copy ()
267264 df ["seq_length" ] = df ["cleaned_preds" ].str .len ()
268- df ["has_special" ] = (
269- df ["cleaned_preds" ].str .contains (r"[^A-Z]" , regex = True ).astype (int )
270- )
265+ df ["has_special" ] = df ["cleaned_preds" ].str .contains (r"[^A-Z]" , regex = True ).astype (int )
271266 df ["first_aa" ] = df ["cleaned_preds" ].str [0 ].astype ("category" ).cat .codes
272267 df ["last_aa" ] = df ["cleaned_preds" ].str [- 1 ].astype ("category" ).cat .codes
273268
@@ -289,40 +284,34 @@ def build_reference_free_features(df, aa_properties, protease_rules):
289284
290285 df ["cterm_matches_protease" ] = [
291286 cterm_matches_any (s , p , protease_rules )
292- for s , p in zip (df ["cleaned_preds" ].fillna ("" ), prots_list )
287+ for s , p in zip (df ["cleaned_preds" ].fillna ("" ), prots_list , strict = False )
293288 ]
294289 df ["nterm_matches_protease" ] = [
295290 nterm_matches_any (s , p , protease_rules )
296- for s , p in zip (df ["cleaned_preds" ].fillna ("" ), prots_list )
291+ for s , p in zip (df ["cleaned_preds" ].fillna ("" ), prots_list , strict = False )
297292 ]
298293 df ["internal_expected_sites_min" ] = [
299294 internal_expected_sites_min (s , p , protease_rules )
300- for s , p in zip (df ["cleaned_preds" ].fillna ("" ), prots_list )
295+ for s , p in zip (df ["cleaned_preds" ].fillna ("" ), prots_list , strict = False )
301296 ]
302297
303- df ["proline_block_at_cterm" ] = (
304- df ["cleaned_preds" ].fillna ("" ).apply (proline_block_at_cterm )
305- )
298+ df ["proline_block_at_cterm" ] = df ["cleaned_preds" ].fillna ("" ).apply (proline_block_at_cterm )
306299 df ["protease" ] = df ["protease" ].astype ("category" ).cat .codes
307300
308301 return df
309302
310303
311304def train_model (df , reference_seq , model_path , aa_properties , protease_rules ):
312305 """Train Random Forest classifier and save model with optimal threshold."""
313- df ["mapped" ] = df ["cleaned_preds" ].apply (
314- lambda x : int (isinstance (x , str ) and x in reference_seq )
315- )
306+ df ["mapped" ] = df ["cleaned_preds" ].apply (lambda x : int (isinstance (x , str ) and x in reference_seq ))
316307 df = build_reference_free_features (df , aa_properties , protease_rules )
317308
318309 exclude = ["experiment_name" , "scan_number" , "preds" , "cleaned_preds" ]
319310 feature_cols = [c for c in df .columns if c not in exclude and c != "mapped" ]
320311
321312 x = df [feature_cols ]
322313 y = df ["mapped" ].astype (int )
323- x_train , x_test , y_train , y_test = train_test_split (
324- x , y , test_size = 0.3 , stratify = y , random_state = 42
325- )
314+ x_train , x_test , y_train , y_test = train_test_split (x , y , test_size = 0.3 , stratify = y , random_state = 42 )
326315
327316 model = RandomForestClassifier (n_estimators = 500 , random_state = 42 , n_jobs = - 1 )
328317 model .fit (x_train , y_train )
@@ -364,9 +353,7 @@ def plot_precision_recall(metrics, output_dir, filename="precision_recall_curve.
364353 best_idx = metrics ["best_idx" ]
365354 ap = metrics ["ap" ]
366355
367- sns .lineplot (
368- x = recall , y = precision , color = "#2E86AB" , linewidth = 1 , label = f"AP = { ap :.2f} "
369- )
356+ sns .lineplot (x = recall , y = precision , color = "#2E86AB" , linewidth = 1 , label = f"AP = { ap :.2f} " )
370357 plt .scatter (
371358 recall [best_idx ],
372359 precision [best_idx ],
@@ -439,25 +426,19 @@ def main():
439426 protein_norm = prep .normalize_sequence (protein )
440427 df = pd .read_csv (INPUT_DIR / f"{ run } .csv" )
441428
442- df ["protease" ] = df ["experiment_name" ].apply (
443- lambda name : prep .extract_protease (name , proteases )
444- )
429+ df ["protease" ] = df ["experiment_name" ].apply (lambda name : prep .extract_protease (name , proteases ))
445430
446431 df = prep .clean_dataframe (df )
447432
448433 df ["cleaned_preds" ] = df ["preds" ].apply (prep .remove_modifications )
449434
450435 cleaned_psms = df ["cleaned_preds" ].tolist ()
451436
452- filtered_psms = prep .filter_contaminants (
453- cleaned_psms , run , FASTA_DIR / "contaminants.fasta"
454- )
437+ filtered_psms = prep .filter_contaminants (cleaned_psms , run , FASTA_DIR / "contaminants.fasta" )
455438
456439 df = df [df ["cleaned_preds" ].isin (filtered_psms )]
457440
458- df ["mapped" ] = df ["cleaned_preds" ].apply (
459- lambda x : int (isinstance (x , str ) and x in protein_norm )
460- )
441+ df ["mapped" ] = df ["cleaned_preds" ].apply (lambda x : int (isinstance (x , str ) and x in protein_norm ))
461442
462443 model_path = BASE_DIR / "peptide_selector.pkl"
463444 metrics = train_model (df , protein , model_path , aa_props , protease_rules )
0 commit comments