Skip to content

Commit eb7f3be

Browse files
authored
Merge pull request #38 from weizhongchun/main
Optimize LucXor to support different score types
2 parents a9f9b78 + 68d0b0f commit eb7f3be

3 files changed

Lines changed: 1350 additions & 21 deletions

File tree

onsite/lucxor/cli.py

Lines changed: 293 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,13 @@
129129
"--modeling-score-threshold",
130130
type=float,
131131
default=0.95,
132-
help="Minimum score for modeling (default: 0.95)"
132+
help="Score threshold for selecting high-quality PSMs for model training. "
133+
"If not specified (default 0.95), the threshold will be auto-adjusted based on detected score type: "
134+
"E-values (SpecEValue, expect) -> 0.01; "
135+
"Raw scores (RawScore, xcorr) -> 50th percentile (median); "
136+
"Probability scores (PEP) -> 0.95. "
137+
"For 'higher is better' scores: PSMs with score >= threshold are used. "
138+
"For 'lower is better' scores: PSMs with score <= threshold are used."
133139
)
134140
@click.option(
135141
"--scoring-threshold",
@@ -179,6 +185,12 @@
179185
default=False,
180186
help="Run all three algorithms (AScore, PhosphoRS, LucXor) and merge results",
181187
)
188+
@click.option(
189+
"--score-type",
190+
type=str,
191+
default=None,
192+
help="Score type to use for PSM filtering (default: auto-detect with priority: PEP > MSGF+ RawScore > Comet xcorr > SpecEValue)",
193+
)
182194
def lucxor(
183195
input_spectrum,
184196
input_id,
@@ -203,6 +215,7 @@ def lucxor(
203215
log_file,
204216
disable_split_by_charge,
205217
compute_all_scores,
218+
score_type,
206219
):
207220
"""
208221
Modification site localization using pyLuciPHOr2 algorithm.
@@ -255,6 +268,7 @@ def lucxor(
255268
rt_tolerance=rt_tolerance,
256269
debug=debug,
257270
disable_split_by_charge=disable_split_by_charge,
271+
score_type=score_type,
258272
)
259273

260274
# Only call sys.exit if not being called from compute_all_scores
@@ -322,9 +336,181 @@ def __init__(self):
322336
self.model = None
323337
self.psms = []
324338
self.config = {}
339+
340+
def select_score_type(self, pep_ids: PeptideIdentificationList, user_score_type: Optional[str] = None) -> Tuple[str, bool]:
341+
"""
342+
Select the best available score type based on priority.
343+
344+
Priority order (context-aware):
345+
1. Posterior Error Probability (Percolator PEP) - always highest priority
346+
2. Search engine specific:
347+
- MSGF+: RawScore (MS:1002049) > SpecEValue (MS:1002052)
348+
- Comet: xcorr (COMET:xcorr) > expect
349+
350+
Rationale:
351+
- PEP: Machine learning calibrated, most reliable
352+
- RawScore/xcorr: Direct match quality, better for model training
353+
- E-values: Statistical significance, but database-size dependent
354+
355+
Args:
356+
pep_ids: List of peptide identifications
357+
user_score_type: User-specified score type (optional)
358+
359+
Returns:
360+
Tuple of (score_type, higher_score_better)
361+
"""
362+
if not pep_ids or len(pep_ids) == 0:
363+
self.logger.warning("No peptide identifications to determine score type")
364+
return None, True
365+
366+
# If user specified a score type, try to use it
367+
if user_score_type:
368+
# Determine if it's higher or lower better based on known score types
369+
higher_better = True
370+
lower_better_scores = [
371+
"posterior error probability", "pep", "q-value", "qvalue",
372+
"expect", "evalue", "e-value", "specevalue", "ms:1002052", "ms:1002053",
373+
"ms:1002257", # Comet expect MS ID
374+
"ms:1001491", "ms:1001493"
375+
]
376+
377+
if any(keyword in user_score_type.lower() for keyword in lower_better_scores):
378+
higher_better = False
379+
380+
self.logger.info(f"Using user-specified score type: {user_score_type} (higher_better={higher_better})")
381+
return user_score_type, higher_better
382+
383+
# Get current score type from first peptide identification
384+
current_score_type = pep_ids[0].getScoreType() if hasattr(pep_ids[0], "getScoreType") else ""
385+
current_score_type_lower = current_score_type.lower()
386+
387+
# Define priority list: (score_type_pattern, higher_is_better, priority_level)
388+
# Priority levels (lower number = higher priority):
389+
# 1: Percolator PEP (machine learning calibrated, most reliable)
390+
# 2: Raw scores (RawScore, xcorr - direct match quality, best for model training)
391+
# 3: E-values (SpecEValue, expect - statistical significance, both search engines)
392+
# Note: When priority is the same, order in the list matters - earlier entries are preferred
393+
preferred_scores = [
394+
("posterior error probability", False, 1), # Percolator PEP - always highest
395+
("ms:1002049", True, 2), # MSGF+ RawScore - best for MSGF+
396+
("comet:xcorr", True, 2), # Comet xcorr - best for Comet (same priority as RawScore)
397+
("specevalue", False, 3), # MSGF+ SpecEValue (matches score_type="SpecEValue") - checked first
398+
("ms:1002052", False, 3), # MSGF+ SpecEValue MS ID (matches UserParam "MS:1002052") - checked second
399+
("expect", False, 3), # Comet E-value (matches score_type="expect") - checked third
400+
("ms:1002257", False, 3), # Comet expect MS ID (matches UserParam "MS:1002257") - checked fourth
401+
]
402+
403+
# Check current score type priority
404+
current_priority = 999 # Default low priority
405+
current_higher_better = True
406+
for score_pattern, higher_better, priority in preferred_scores:
407+
if score_pattern in current_score_type_lower:
408+
current_priority = priority
409+
current_higher_better = higher_better
410+
break
411+
412+
# Check UserParam scores and find the highest priority one
413+
best_userparam_score = None
414+
best_userparam_priority = 999
415+
best_userparam_higher_better = True
416+
417+
if len(pep_ids[0].getHits()) > 0:
418+
hit = pep_ids[0].getHits()[0]
419+
420+
for score_pattern, higher_better, priority in preferred_scores:
421+
# Check if this score exists as a UserParam (case-insensitive)
422+
# Try exact match first, then case-insensitive
423+
score_exists = False
424+
actual_score_name = None
425+
426+
# Try exact pattern match
427+
if hit.metaValueExists(score_pattern):
428+
score_exists = True
429+
actual_score_name = score_pattern
430+
else:
431+
# Try common variations
432+
variations = [
433+
score_pattern,
434+
score_pattern.upper(),
435+
score_pattern.title(),
436+
"COMET:xcorr" if score_pattern == "comet:xcorr" else None,
437+
"MS:1002049" if score_pattern == "ms:1002049" else None,
438+
"MS:1002052" if score_pattern == "ms:1002052" else None,
439+
"MS:1002257" if score_pattern == "ms:1002257" else None,
440+
]
441+
442+
for variation in variations:
443+
if variation and hit.metaValueExists(variation):
444+
score_exists = True
445+
actual_score_name = variation
446+
break
447+
448+
if score_exists and priority < best_userparam_priority:
449+
best_userparam_score = actual_score_name
450+
best_userparam_priority = priority
451+
best_userparam_higher_better = higher_better
452+
453+
# Compare current score type with best UserParam score
454+
# When priorities are equal, prefer UserParam score (as it appears earlier in preferred_scores list)
455+
if best_userparam_score and best_userparam_priority <= current_priority:
456+
return best_userparam_score, best_userparam_higher_better
457+
elif current_priority < 999:
458+
return current_score_type, current_higher_better
459+
460+
# Fallback: use current score type and try to determine direction
461+
higher_score_better = True
462+
if hasattr(pep_ids[0], "isHigherScoreBetter"):
463+
higher_score_better = pep_ids[0].isHigherScoreBetter()
464+
elif hasattr(pep_ids[0], "getHigherScoreBetter"):
465+
higher_score_better = pep_ids[0].getHigherScoreBetter()
466+
467+
self.logger.warning(f"Using fallback score type: {current_score_type} (higher_better={higher_score_better})")
468+
return current_score_type, higher_score_better
469+
470+
def get_psm_score(self, pep_id: PeptideIdentification, hit, score_type: str, higher_score_better: bool) -> float:
471+
"""
472+
Extract score from peptide hit based on score type.
473+
474+
Args:
475+
pep_id: PeptideIdentification object
476+
hit: PeptideHit object
477+
score_type: Name of the score type to extract
478+
higher_score_better: Whether higher scores are better
479+
480+
Returns:
481+
Normalized score (always higher is better, range 0-1 for probability-based scores)
482+
"""
483+
score = hit.getScore()
484+
485+
# Check if score exists as UserParam
486+
if hit.metaValueExists(score_type):
487+
score = float(hit.getMetaValue(score_type))
488+
489+
# Normalize score based on type
490+
score_type_lower = (score_type or "").lower()
491+
is_probability_score = any(k in score_type_lower for k in [
492+
"posterior error probability", "pep", "q-value", "qvalue",
493+
"ms:1001493", "ms:1001491"
494+
])
495+
496+
if is_probability_score:
497+
# For probability-based scores (PEP, Q-value), convert to 1-score
498+
# These are already probabilities between 0 and 1
499+
if 0 <= score <= 1:
500+
return 1.0 - score
501+
else:
502+
self.logger.warning(f"PEP/Q-value out of range [0,1]: {score}, using as-is")
503+
return score
504+
505+
# For E-values and other "lower is better" scores, keep as-is
506+
# Don't transform them - they will be handled differently in filtering
507+
if not higher_score_better:
508+
return score
509+
510+
return score
325511

326512
def load_input_files(
327-
self, input_id: str, input_spectrum: str
513+
self, input_id: str, input_spectrum: str, score_type: Optional[str] = None
328514
) -> Tuple[PeptideIdentificationList, List[ProteinIdentification], MSExperiment]:
329515
"""Load input files"""
330516
# Load identifications
@@ -338,6 +524,21 @@ def load_input_files(
338524

339525
# Keep only best hits
340526
IDFilter().keepNBestHits(pep_ids, 1)
527+
528+
# Select best score type based on priority
529+
score_type_param, higher_score_better = self.select_score_type(pep_ids, user_score_type=score_type)
530+
531+
# Override higher_score_better for probability scores after conversion
532+
# Since we convert PEP/Q-value to 1-score, they become "higher is better"
533+
if score_type_param and any(k in score_type_param.lower() for k in [
534+
"posterior error probability", "pep", "q-value", "qvalue",
535+
"ms:1001493", "ms:1001491"
536+
]):
537+
higher_score_better = True
538+
539+
# Store score info for later use
540+
self.score_type = score_type_param
541+
self.higher_score_better = higher_score_better
341542

342543
# Load spectra
343544
exp = MSExperiment()
@@ -385,6 +586,7 @@ def run(
385586
rt_tolerance: float,
386587
debug: bool,
387588
disable_split_by_charge: bool = False,
589+
score_type: Optional[str] = None,
388590
) -> int:
389591
"""
390592
LuciPHOr2 main workflow:
@@ -433,7 +635,7 @@ def run(
433635
self.logger.debug(f"Debug mode: {debug}")
434636
self.logger.debug(f"Log level: {logging.getLogger().level}")
435637

436-
pep_ids, prot_ids, exp = self.load_input_files(input_id, input_spectrum)
638+
pep_ids, prot_ids, exp = self.load_input_files(input_id, input_spectrum, score_type)
437639
if not pep_ids or exp is None:
438640
self.logger.error("No valid peptide identification or spectrum data found, process terminated.")
439641
return 1
@@ -523,22 +725,17 @@ def run(
523725
# Set search_engine_sequence to the original sequence
524726
psm.search_engine_sequence = sequence
525727

526-
# Automatically determine score type and assign values
527-
score_type = pep_id.getScoreType() if hasattr(pep_id, "getScoreType") else None
528-
score = hit.getScore()
529-
higher_score_better = True
530-
if hasattr(pep_id, "isHigherScoreBetter"):
531-
higher_score_better = pep_id.isHigherScoreBetter()
532-
elif hasattr(pep_id, "getHigherScoreBetter"):
533-
higher_score_better = pep_id.getHigherScoreBetter()
728+
# Get score using the selected score type
729+
score = self.get_psm_score(pep_id, hit, self.score_type, self.higher_score_better)
730+
731+
# Assign score to PSM and peptide
732+
psm.psm_score = score
733+
peptide.score = score
734+
735+
# Store score metadata for filtering
736+
psm.score_type = self.score_type
737+
psm.higher_score_better = self.higher_score_better
534738

535-
# If it"s a PEP score (lower is better), convert to 1-PEP
536-
if (score_type and "posterior error probability" in score_type.lower()) or (higher_score_better is False):
537-
psm.psm_score = 1.0 - score
538-
peptide.score = 1.0 - score
539-
else:
540-
psm.psm_score = score
541-
peptide.score = score
542739
all_psms.append(psm)
543740
else:
544741
self.logger.warning(f'No matching spectrum found - RT: {rt}, Scan: {scan_num if scan_num else "N/A"}')
@@ -550,6 +747,62 @@ def run(
550747
# 3. Train model with high-scoring PSMs
551748
modeling_score_threshold_val = config.get("modeling_score_threshold", 0.95)
552749

750+
# Auto-adjust modeling_score_threshold based on score_type if using default value
751+
# Only adjust if user didn't explicitly set a non-default threshold
752+
user_set_threshold = modeling_score_threshold != 0.95 # Check if user changed from default
753+
754+
if not user_set_threshold:
755+
# Auto-adjust threshold based on detected score type
756+
score_type_lower = self.score_type.lower()
757+
758+
# Check if it's a probability score (case-insensitive)
759+
is_probability_score = any(keyword in score_type_lower for keyword in [
760+
"posterior error probability", "pep", "q-value", "qvalue",
761+
"ms:1001493", "ms:1001491"
762+
])
763+
764+
# Check if it's an E-value score
765+
is_evalue_score = any(keyword in score_type_lower for keyword in [
766+
"evalue", "e-value", "expect", "specevalue", "ms:1002052", "ms:1002257"
767+
])
768+
769+
# Check if it's a raw score
770+
is_raw_score = any(keyword in score_type_lower for keyword in [
771+
"rawscore", "ms:1002049", "xcorr", "comet:xcorr"
772+
])
773+
774+
if is_probability_score:
775+
# For PEP/Q-value (already transformed to 1-score, so higher is better)
776+
modeling_score_threshold_val = 0.95
777+
self.logger.info(f"Auto-adjusted modeling_score_threshold to {modeling_score_threshold_val} for probability-based score")
778+
elif is_evalue_score:
779+
# For E-values (lower is better)
780+
modeling_score_threshold_val = 0.01
781+
self.logger.info(f"Auto-adjusted modeling_score_threshold to {modeling_score_threshold_val} for E-value score type")
782+
elif is_raw_score:
783+
# For raw scores, use percentile-based approach
784+
all_scores = [psm.psm_score for psm in all_psms if hasattr(psm, "psm_score") and not np.isnan(psm.psm_score)]
785+
if all_scores:
786+
# Use 50th percentile (median) as threshold for raw scores
787+
# This balances quality and quantity, keeping top 50% of PSMs
788+
modeling_score_threshold_val = np.percentile(all_scores, 50)
789+
self.logger.info(f"Auto-adjusted modeling_score_threshold to {modeling_score_threshold_val:.2f} (50th percentile) for raw score type")
790+
else:
791+
modeling_score_threshold_val = 0.0 # Fallback: accept all
792+
self.logger.warning(f"No valid scores found, using threshold {modeling_score_threshold_val}")
793+
else:
794+
# Unknown score type, keep default
795+
modeling_score_threshold_val = 0.95
796+
self.logger.info(f"Using default modeling_score_threshold {modeling_score_threshold_val} for unknown score type")
797+
798+
# Update config with adjusted threshold
799+
config["modeling_score_threshold"] = modeling_score_threshold_val
800+
801+
# Log the score type and threshold being used
802+
self.logger.info(f"Using score type '{self.score_type}' for PSM filtering")
803+
self.logger.info(f"Score direction: {'higher is better' if self.higher_score_better else 'lower is better'}")
804+
self.logger.info(f"Final modeling score threshold: {modeling_score_threshold_val}")
805+
553806
# First filter PSMs with modification sites
554807
phospho_psms = []
555808
for psm in all_psms:
@@ -563,9 +816,28 @@ def run(
563816
phospho_psms.append(psm)
564817

565818
# Then filter high-scoring PSMs from PSMs with modification sites
566-
high_score_psms = [psm for psm in phospho_psms if hasattr(psm, "psm_score") and psm.psm_score >= modeling_score_threshold_val]
567-
if not high_score_psms:
568-
high_score_psms = [psm for psm in phospho_psms if hasattr(psm, "peptide") and hasattr(psm.peptide, "score") and psm.peptide.score >= modeling_score_threshold_val]
819+
# Use correct comparison based on score direction
820+
high_score_psms = []
821+
for psm in phospho_psms:
822+
if hasattr(psm, "psm_score"):
823+
# For "higher is better" scores, use >= threshold
824+
# For "lower is better" scores (like E-values), use <= threshold
825+
if self.higher_score_better:
826+
if psm.psm_score >= modeling_score_threshold_val:
827+
high_score_psms.append(psm)
828+
else:
829+
# For "lower is better" scores (E-values), use the modeling_score_threshold directly
830+
# Lower threshold means more stringent filtering
831+
if psm.psm_score <= modeling_score_threshold_val:
832+
high_score_psms.append(psm)
833+
elif hasattr(psm, "peptide") and hasattr(psm.peptide, "score"):
834+
if self.higher_score_better:
835+
if psm.peptide.score >= modeling_score_threshold_val:
836+
high_score_psms.append(psm)
837+
else:
838+
# For "lower is better" scores, use the modeling_score_threshold directly
839+
if psm.peptide.score <= modeling_score_threshold_val:
840+
high_score_psms.append(psm)
569841

570842
self.logger.info(f"Total PSMs: {len(all_psms)}")
571843
self.logger.info(f"PSMs with modification sites: {len(phospho_psms)}")

0 commit comments

Comments
 (0)