129129 "--modeling-score-threshold" ,
130130 type = float ,
131131 default = 0.95 ,
132- help = "Minimum score for modeling (default: 0.95)"
132+ help = "Score threshold for selecting high-quality PSMs for model training. "
133+ "If not specified (default 0.95), the threshold will be auto-adjusted based on detected score type: "
134+ "E-values (SpecEValue, expect) -> 0.01; "
135+ "Raw scores (RawScore, xcorr) -> 50th percentile (median); "
136+ "Probability scores (PEP) -> 0.95. "
137+ "For 'higher is better' scores: PSMs with score >= threshold are used. "
138+ "For 'lower is better' scores: PSMs with score <= threshold are used."
133139)
134140@click .option (
135141 "--scoring-threshold" ,
179185 default = False ,
180186 help = "Run all three algorithms (AScore, PhosphoRS, LucXor) and merge results" ,
181187)
188+ @click .option (
189+ "--score-type" ,
190+ type = str ,
191+ default = None ,
192+ help = "Score type to use for PSM filtering (default: auto-detect with priority: PEP > MSGF+ RawScore > Comet xcorr > SpecEValue)" ,
193+ )
182194def lucxor (
183195 input_spectrum ,
184196 input_id ,
@@ -203,6 +215,7 @@ def lucxor(
203215 log_file ,
204216 disable_split_by_charge ,
205217 compute_all_scores ,
218+ score_type ,
206219):
207220 """
208221 Modification site localization using pyLuciPHOr2 algorithm.
@@ -255,6 +268,7 @@ def lucxor(
255268 rt_tolerance = rt_tolerance ,
256269 debug = debug ,
257270 disable_split_by_charge = disable_split_by_charge ,
271+ score_type = score_type ,
258272 )
259273
260274 # Only call sys.exit if not being called from compute_all_scores
@@ -322,9 +336,181 @@ def __init__(self):
322336 self .model = None
323337 self .psms = []
324338 self .config = {}
339+
340+ def select_score_type (self , pep_ids : PeptideIdentificationList , user_score_type : Optional [str ] = None ) -> Tuple [str , bool ]:
341+ """
342+ Select the best available score type based on priority.
343+
344+ Priority order (context-aware):
345+ 1. Posterior Error Probability (Percolator PEP) - always highest priority
346+ 2. Search engine specific:
347+ - MSGF+: RawScore (MS:1002049) > SpecEValue (MS:1002052)
348+ - Comet: xcorr (COMET:xcorr) > expect
349+
350+ Rationale:
351+ - PEP: Machine learning calibrated, most reliable
352+ - RawScore/xcorr: Direct match quality, better for model training
353+ - E-values: Statistical significance, but database-size dependent
354+
355+ Args:
356+ pep_ids: List of peptide identifications
357+ user_score_type: User-specified score type (optional)
358+
359+ Returns:
360+ Tuple of (score_type, higher_score_better)
361+ """
362+ if not pep_ids or len (pep_ids ) == 0 :
363+ self .logger .warning ("No peptide identifications to determine score type" )
364+ return None , True
365+
366+ # If user specified a score type, try to use it
367+ if user_score_type :
368+ # Determine if it's higher or lower better based on known score types
369+ higher_better = True
370+ lower_better_scores = [
371+ "posterior error probability" , "pep" , "q-value" , "qvalue" ,
372+ "expect" , "evalue" , "e-value" , "specevalue" , "ms:1002052" , "ms:1002053" ,
373+ "ms:1002257" , # Comet expect MS ID
374+ "ms:1001491" , "ms:1001493"
375+ ]
376+
377+ if any (keyword in user_score_type .lower () for keyword in lower_better_scores ):
378+ higher_better = False
379+
380+ self .logger .info (f"Using user-specified score type: { user_score_type } (higher_better={ higher_better } )" )
381+ return user_score_type , higher_better
382+
383+ # Get current score type from first peptide identification
384+ current_score_type = pep_ids [0 ].getScoreType () if hasattr (pep_ids [0 ], "getScoreType" ) else ""
385+ current_score_type_lower = current_score_type .lower ()
386+
387+ # Define priority list: (score_type_pattern, higher_is_better, priority_level)
388+ # Priority levels (lower number = higher priority):
389+ # 1: Percolator PEP (machine learning calibrated, most reliable)
390+ # 2: Raw scores (RawScore, xcorr - direct match quality, best for model training)
391+ # 3: E-values (SpecEValue, expect - statistical significance, both search engines)
392+ # Note: When priority is the same, order in the list matters - earlier entries are preferred
393+ preferred_scores = [
394+ ("posterior error probability" , False , 1 ), # Percolator PEP - always highest
395+ ("ms:1002049" , True , 2 ), # MSGF+ RawScore - best for MSGF+
396+ ("comet:xcorr" , True , 2 ), # Comet xcorr - best for Comet (same priority as RawScore)
397+ ("specevalue" , False , 3 ), # MSGF+ SpecEValue (matches score_type="SpecEValue") - checked first
398+ ("ms:1002052" , False , 3 ), # MSGF+ SpecEValue MS ID (matches UserParam "MS:1002052") - checked second
399+ ("expect" , False , 3 ), # Comet E-value (matches score_type="expect") - checked third
400+ ("ms:1002257" , False , 3 ), # Comet expect MS ID (matches UserParam "MS:1002257") - checked fourth
401+ ]
402+
403+ # Check current score type priority
404+ current_priority = 999 # Default low priority
405+ current_higher_better = True
406+ for score_pattern , higher_better , priority in preferred_scores :
407+ if score_pattern in current_score_type_lower :
408+ current_priority = priority
409+ current_higher_better = higher_better
410+ break
411+
412+ # Check UserParam scores and find the highest priority one
413+ best_userparam_score = None
414+ best_userparam_priority = 999
415+ best_userparam_higher_better = True
416+
417+ if len (pep_ids [0 ].getHits ()) > 0 :
418+ hit = pep_ids [0 ].getHits ()[0 ]
419+
420+ for score_pattern , higher_better , priority in preferred_scores :
421+ # Check if this score exists as a UserParam (case-insensitive)
422+ # Try exact match first, then case-insensitive
423+ score_exists = False
424+ actual_score_name = None
425+
426+ # Try exact pattern match
427+ if hit .metaValueExists (score_pattern ):
428+ score_exists = True
429+ actual_score_name = score_pattern
430+ else :
431+ # Try common variations
432+ variations = [
433+ score_pattern ,
434+ score_pattern .upper (),
435+ score_pattern .title (),
436+ "COMET:xcorr" if score_pattern == "comet:xcorr" else None ,
437+ "MS:1002049" if score_pattern == "ms:1002049" else None ,
438+ "MS:1002052" if score_pattern == "ms:1002052" else None ,
439+ "MS:1002257" if score_pattern == "ms:1002257" else None ,
440+ ]
441+
442+ for variation in variations :
443+ if variation and hit .metaValueExists (variation ):
444+ score_exists = True
445+ actual_score_name = variation
446+ break
447+
448+ if score_exists and priority < best_userparam_priority :
449+ best_userparam_score = actual_score_name
450+ best_userparam_priority = priority
451+ best_userparam_higher_better = higher_better
452+
453+ # Compare current score type with best UserParam score
454+ # When priorities are equal, prefer UserParam score (as it appears earlier in preferred_scores list)
455+ if best_userparam_score and best_userparam_priority <= current_priority :
456+ return best_userparam_score , best_userparam_higher_better
457+ elif current_priority < 999 :
458+ return current_score_type , current_higher_better
459+
460+ # Fallback: use current score type and try to determine direction
461+ higher_score_better = True
462+ if hasattr (pep_ids [0 ], "isHigherScoreBetter" ):
463+ higher_score_better = pep_ids [0 ].isHigherScoreBetter ()
464+ elif hasattr (pep_ids [0 ], "getHigherScoreBetter" ):
465+ higher_score_better = pep_ids [0 ].getHigherScoreBetter ()
466+
467+ self .logger .warning (f"Using fallback score type: { current_score_type } (higher_better={ higher_score_better } )" )
468+ return current_score_type , higher_score_better
469+
470+ def get_psm_score (self , pep_id : PeptideIdentification , hit , score_type : str , higher_score_better : bool ) -> float :
471+ """
472+ Extract score from peptide hit based on score type.
473+
474+ Args:
475+ pep_id: PeptideIdentification object
476+ hit: PeptideHit object
477+ score_type: Name of the score type to extract
478+ higher_score_better: Whether higher scores are better
479+
480+ Returns:
481+ Normalized score (always higher is better, range 0-1 for probability-based scores)
482+ """
483+ score = hit .getScore ()
484+
485+ # Check if score exists as UserParam
486+ if hit .metaValueExists (score_type ):
487+ score = float (hit .getMetaValue (score_type ))
488+
489+ # Normalize score based on type
490+ score_type_lower = (score_type or "" ).lower ()
491+ is_probability_score = any (k in score_type_lower for k in [
492+ "posterior error probability" , "pep" , "q-value" , "qvalue" ,
493+ "ms:1001493" , "ms:1001491"
494+ ])
495+
496+ if is_probability_score :
497+ # For probability-based scores (PEP, Q-value), convert to 1-score
498+ # These are already probabilities between 0 and 1
499+ if 0 <= score <= 1 :
500+ return 1.0 - score
501+ else :
502+ self .logger .warning (f"PEP/Q-value out of range [0,1]: { score } , using as-is" )
503+ return score
504+
505+ # For E-values and other "lower is better" scores, keep as-is
506+ # Don't transform them - they will be handled differently in filtering
507+ if not higher_score_better :
508+ return score
509+
510+ return score
325511
326512 def load_input_files (
327- self , input_id : str , input_spectrum : str
513+ self , input_id : str , input_spectrum : str , score_type : Optional [ str ] = None
328514 ) -> Tuple [PeptideIdentificationList , List [ProteinIdentification ], MSExperiment ]:
329515 """Load input files"""
330516 # Load identifications
@@ -338,6 +524,21 @@ def load_input_files(
338524
339525 # Keep only best hits
340526 IDFilter ().keepNBestHits (pep_ids , 1 )
527+
528+ # Select best score type based on priority
529+ score_type_param , higher_score_better = self .select_score_type (pep_ids , user_score_type = score_type )
530+
531+ # Override higher_score_better for probability scores after conversion
532+ # Since we convert PEP/Q-value to 1-score, they become "higher is better"
533+ if score_type_param and any (k in score_type_param .lower () for k in [
534+ "posterior error probability" , "pep" , "q-value" , "qvalue" ,
535+ "ms:1001493" , "ms:1001491"
536+ ]):
537+ higher_score_better = True
538+
539+ # Store score info for later use
540+ self .score_type = score_type_param
541+ self .higher_score_better = higher_score_better
341542
342543 # Load spectra
343544 exp = MSExperiment ()
@@ -385,6 +586,7 @@ def run(
385586 rt_tolerance : float ,
386587 debug : bool ,
387588 disable_split_by_charge : bool = False ,
589+ score_type : Optional [str ] = None ,
388590 ) -> int :
389591 """
390592 LuciPHOr2 main workflow:
@@ -433,7 +635,7 @@ def run(
433635 self .logger .debug (f"Debug mode: { debug } " )
434636 self .logger .debug (f"Log level: { logging .getLogger ().level } " )
435637
436- pep_ids , prot_ids , exp = self .load_input_files (input_id , input_spectrum )
638+ pep_ids , prot_ids , exp = self .load_input_files (input_id , input_spectrum , score_type )
437639 if not pep_ids or exp is None :
438640 self .logger .error ("No valid peptide identification or spectrum data found, process terminated." )
439641 return 1
@@ -523,22 +725,17 @@ def run(
523725 # Set search_engine_sequence to the original sequence
524726 psm .search_engine_sequence = sequence
525727
526- # Automatically determine score type and assign values
527- score_type = pep_id .getScoreType () if hasattr (pep_id , "getScoreType" ) else None
528- score = hit .getScore ()
529- higher_score_better = True
530- if hasattr (pep_id , "isHigherScoreBetter" ):
531- higher_score_better = pep_id .isHigherScoreBetter ()
532- elif hasattr (pep_id , "getHigherScoreBetter" ):
533- higher_score_better = pep_id .getHigherScoreBetter ()
728+ # Get score using the selected score type
729+ score = self .get_psm_score (pep_id , hit , self .score_type , self .higher_score_better )
730+
731+ # Assign score to PSM and peptide
732+ psm .psm_score = score
733+ peptide .score = score
734+
735+ # Store score metadata for filtering
736+ psm .score_type = self .score_type
737+ psm .higher_score_better = self .higher_score_better
534738
535- # If it"s a PEP score (lower is better), convert to 1-PEP
536- if (score_type and "posterior error probability" in score_type .lower ()) or (higher_score_better is False ):
537- psm .psm_score = 1.0 - score
538- peptide .score = 1.0 - score
539- else :
540- psm .psm_score = score
541- peptide .score = score
542739 all_psms .append (psm )
543740 else :
544741 self .logger .warning (f'No matching spectrum found - RT: { rt } , Scan: { scan_num if scan_num else "N/A" } ' )
@@ -550,6 +747,62 @@ def run(
550747 # 3. Train model with high-scoring PSMs
551748 modeling_score_threshold_val = config .get ("modeling_score_threshold" , 0.95 )
552749
750+ # Auto-adjust modeling_score_threshold based on score_type if using default value
751+ # Only adjust if user didn't explicitly set a non-default threshold
752+ user_set_threshold = modeling_score_threshold != 0.95 # Check if user changed from default
753+
754+ if not user_set_threshold :
755+ # Auto-adjust threshold based on detected score type
756+ score_type_lower = self .score_type .lower ()
757+
758+ # Check if it's a probability score (case-insensitive)
759+ is_probability_score = any (keyword in score_type_lower for keyword in [
760+ "posterior error probability" , "pep" , "q-value" , "qvalue" ,
761+ "ms:1001493" , "ms:1001491"
762+ ])
763+
764+ # Check if it's an E-value score
765+ is_evalue_score = any (keyword in score_type_lower for keyword in [
766+ "evalue" , "e-value" , "expect" , "specevalue" , "ms:1002052" , "ms:1002257"
767+ ])
768+
769+ # Check if it's a raw score
770+ is_raw_score = any (keyword in score_type_lower for keyword in [
771+ "rawscore" , "ms:1002049" , "xcorr" , "comet:xcorr"
772+ ])
773+
774+ if is_probability_score :
775+ # For PEP/Q-value (already transformed to 1-score, so higher is better)
776+ modeling_score_threshold_val = 0.95
777+ self .logger .info (f"Auto-adjusted modeling_score_threshold to { modeling_score_threshold_val } for probability-based score" )
778+ elif is_evalue_score :
779+ # For E-values (lower is better)
780+ modeling_score_threshold_val = 0.01
781+ self .logger .info (f"Auto-adjusted modeling_score_threshold to { modeling_score_threshold_val } for E-value score type" )
782+ elif is_raw_score :
783+ # For raw scores, use percentile-based approach
784+ all_scores = [psm .psm_score for psm in all_psms if hasattr (psm , "psm_score" ) and not np .isnan (psm .psm_score )]
785+ if all_scores :
786+ # Use 50th percentile (median) as threshold for raw scores
787+ # This balances quality and quantity, keeping top 50% of PSMs
788+ modeling_score_threshold_val = np .percentile (all_scores , 50 )
789+ self .logger .info (f"Auto-adjusted modeling_score_threshold to { modeling_score_threshold_val :.2f} (50th percentile) for raw score type" )
790+ else :
791+ modeling_score_threshold_val = 0.0 # Fallback: accept all
792+ self .logger .warning (f"No valid scores found, using threshold { modeling_score_threshold_val } " )
793+ else :
794+ # Unknown score type, keep default
795+ modeling_score_threshold_val = 0.95
796+ self .logger .info (f"Using default modeling_score_threshold { modeling_score_threshold_val } for unknown score type" )
797+
798+ # Update config with adjusted threshold
799+ config ["modeling_score_threshold" ] = modeling_score_threshold_val
800+
801+ # Log the score type and threshold being used
802+ self .logger .info (f"Using score type '{ self .score_type } ' for PSM filtering" )
803+ self .logger .info (f"Score direction: { 'higher is better' if self .higher_score_better else 'lower is better' } " )
804+ self .logger .info (f"Final modeling score threshold: { modeling_score_threshold_val } " )
805+
553806 # First filter PSMs with modification sites
554807 phospho_psms = []
555808 for psm in all_psms :
@@ -563,9 +816,28 @@ def run(
563816 phospho_psms .append (psm )
564817
565818 # Then filter high-scoring PSMs from PSMs with modification sites
566- high_score_psms = [psm for psm in phospho_psms if hasattr (psm , "psm_score" ) and psm .psm_score >= modeling_score_threshold_val ]
567- if not high_score_psms :
568- high_score_psms = [psm for psm in phospho_psms if hasattr (psm , "peptide" ) and hasattr (psm .peptide , "score" ) and psm .peptide .score >= modeling_score_threshold_val ]
819+ # Use correct comparison based on score direction
820+ high_score_psms = []
821+ for psm in phospho_psms :
822+ if hasattr (psm , "psm_score" ):
823+ # For "higher is better" scores, use >= threshold
824+ # For "lower is better" scores (like E-values), use <= threshold
825+ if self .higher_score_better :
826+ if psm .psm_score >= modeling_score_threshold_val :
827+ high_score_psms .append (psm )
828+ else :
829+ # For "lower is better" scores (E-values), use the modeling_score_threshold directly
830+ # Lower threshold means more stringent filtering
831+ if psm .psm_score <= modeling_score_threshold_val :
832+ high_score_psms .append (psm )
833+ elif hasattr (psm , "peptide" ) and hasattr (psm .peptide , "score" ):
834+ if self .higher_score_better :
835+ if psm .peptide .score >= modeling_score_threshold_val :
836+ high_score_psms .append (psm )
837+ else :
838+ # For "lower is better" scores, use the modeling_score_threshold directly
839+ if psm .peptide .score <= modeling_score_threshold_val :
840+ high_score_psms .append (psm )
569841
570842 self .logger .info (f"Total PSMs: { len (all_psms )} " )
571843 self .logger .info (f"PSMs with modification sites: { len (phospho_psms )} " )
0 commit comments