Skip to content

Commit dbdc46e

Browse files
authored
Merge pull request #32 from bigbio/claude/pyopenms-spectrum-generation
Use PyOpenMS for theoretical spectrum generation in LucXor
2 parents e578233 + 481f269 commit dbdc46e

19 files changed

Lines changed: 2132 additions & 1194 deletions

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ onsite provides three complementary algorithms for PTM localization:
4747
### Prerequisites
4848

4949
- Python 3.11+
50-
- PyOpenMS 3.4.0+
50+
- PyOpenMS 3.5.0+
5151
- NumPy 2.3.2+
5252
- SciPy 1.16.1+
5353

docs/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ onsite provides three complementary algorithms for PTM localization:
7979
### Prerequisites
8080

8181
- Python 3.11+
82-
- PyOpenMS 3.4.0+
82+
- PyOpenMS 3.5.0+
8383
- NumPy 2.3.2+
8484
- SciPy 1.16.1+
8585

onsite/ascore/cli.py

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import logging
77
import traceback
88
from datetime import datetime
9-
from concurrent.futures import ProcessPoolExecutor, as_completed
9+
from concurrent.futures import ThreadPoolExecutor, as_completed
1010

1111
import click
1212
from pyopenms import *
@@ -143,7 +143,7 @@ def ascore(
143143

144144
# Main processing loop
145145
start_time = time.time()
146-
processed_peptide_ids = []
146+
processed_peptide_ids = PeptideIdentificationList()
147147

148148
# Process each PeptideIdentification (optionally in parallel)
149149
if max(1, int(threads)) == 1:
@@ -165,7 +165,7 @@ def ascore(
165165
)
166166

167167
if result["status"] == "success":
168-
processed_peptide_ids.append(result["new_pid"])
168+
processed_peptide_ids.push_back(result["new_pid"])
169169
stats["processed"] += 1
170170
phospho_count = len([h for h in result["new_pid"].getHits()
171171
if "(Phospho)" in h.getSequence().toString()])
@@ -187,13 +187,13 @@ def ascore(
187187
else:
188188
workers = max(1, int(threads))
189189
click.echo(
190-
f"[{time.strftime('%H:%M:%S')}] Parallel execution with {workers} processes"
190+
f"[{time.strftime('%H:%M:%S')}] Parallel execution with {workers} threads"
191191
)
192192

193193
if debug:
194194
logger.info(f"Starting parallel processing with {workers} workers")
195195

196-
# Build serializable tasks in dict format
196+
# Build tasks - with threads we can pass objects directly (shared memory)
197197
params = {
198198
"fragment_mass_tolerance": fragment_mass_tolerance,
199199
"fragment_mass_unit": fragment_mass_unit,
@@ -208,7 +208,7 @@ def ascore(
208208
hit_payloads.append({"sequence": seq_str, "proforma": proforma})
209209
tasks.append({
210210
"idx": idx,
211-
"mzml_path": in_file,
211+
"exp": exp, # Pass spectrum object directly - shared between threads
212212
"params": params,
213213
"pid": {
214214
"mz": pid.getMZ(),
@@ -221,8 +221,8 @@ def ascore(
221221
logger.info(f"Created {len(tasks)} parallel tasks")
222222

223223
indexed_results = {}
224-
with ProcessPoolExecutor(max_workers=workers) as executor:
225-
futures = {executor.submit(_worker_process_pid, t): t["idx"] for t in tasks}
224+
with ThreadPoolExecutor(max_workers=workers) as executor:
225+
futures = {executor.submit(_worker_process_pid_threaded, t): t["idx"] for t in tasks}
226226
for fut in as_completed(futures):
227227
idx = futures[fut]
228228
try:
@@ -277,7 +277,7 @@ def ascore(
277277
new_hits.append(new_hit)
278278

279279
new_pid.setHits(new_hits)
280-
processed_peptide_ids.append(new_pid)
280+
processed_peptide_ids.push_back(new_pid)
281281
stats["processed"] += 1
282282
phospho_count = len([h for h in new_pid.getHits() if "(Phospho)" in h.getSequence().toString()])
283283
stats["phospho"] += phospho_count
@@ -337,7 +337,7 @@ def load_identifications(idxml_file):
337337
"""Load identification results"""
338338
print(f"[{time.strftime('%H:%M:%S')}] Loading identifications from {idxml_file}")
339339
protein_ids = []
340-
peptide_ids = []
340+
peptide_ids = PeptideIdentificationList()
341341
IdXMLFile().load(idxml_file, protein_ids, peptide_ids)
342342
print(f"Loaded {len(peptide_ids)} peptide identifications")
343343
return protein_ids, peptide_ids
@@ -440,34 +440,24 @@ def find_spectrum_by_mz(exp, target_mz, rt=None, ppm_tolerance=10):
440440
return best_match
441441

442442

443-
# ----------------------- Multiprocessing worker utilities -----------------------
444-
_WORKER_EXP = None
443+
# ----------------------- Threading worker utilities -----------------------
444+
# Note: Using ThreadPoolExecutor instead of ProcessPoolExecutor allows threads
445+
# to share the spectrum data (exp object) directly without reloading the file.
446+
# This provides significant performance improvement for parallel processing.
445447

446448

447-
def _worker_get_exp(mzml_file):
448-
global _WORKER_EXP
449-
if _WORKER_EXP is None:
450-
exp = MSExperiment()
451-
FileHandler().loadExperiment(mzml_file, exp)
452-
# Warm up spectrum index inside the worker for faster lookups
453-
if exp.size() > 0:
454-
# Rebuild local cache for find_spectrum_by_mz in this process
455-
if hasattr(find_spectrum_by_mz, "spectrum_list"):
456-
delattr(find_spectrum_by_mz, "spectrum_list")
457-
_ = find_spectrum_by_mz(exp, 0.0, None)
458-
_WORKER_EXP = exp
459-
return _WORKER_EXP
449+
def _worker_process_pid_threaded(task):
450+
"""Thread-safe worker that uses shared spectrum data.
460451
461-
462-
def _worker_process_pid(task):
452+
Unlike process-based workers, threads share memory so we can pass
453+
the exp object directly without serialization or file reloading.
454+
"""
463455
try:
464-
mzml_path = task["mzml_path"]
456+
exp = task["exp"] # Shared spectrum object - no file reload needed
465457
pid_info = task["pid"]
466458
params = task["params"]
467459

468-
exp = _worker_get_exp(mzml_path)
469-
470-
# Find spectrum
460+
# Find spectrum (uses shared cache from main thread)
471461
spectrum = find_spectrum_by_mz(exp, pid_info["mz"], pid_info.get("rt"))
472462
if spectrum is None:
473463
return {"status": "error", "reason": "spectrum_not_found"}

onsite/lucxor/cli.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
MzMLFile,
1919
MSExperiment,
2020
PeptideIdentification,
21+
PeptideIdentificationList,
2122
ProteinIdentification,
2223
IDFilter,
2324
)
@@ -324,16 +325,16 @@ def __init__(self):
324325

325326
def load_input_files(
326327
self, input_id: str, input_spectrum: str
327-
) -> Tuple[List[PeptideIdentification], List[ProteinIdentification], MSExperiment]:
328+
) -> Tuple[PeptideIdentificationList, List[ProteinIdentification], MSExperiment]:
328329
"""Load input files"""
329330
# Load identifications
330-
pep_ids = []
331+
pep_ids = PeptideIdentificationList()
331332
prot_ids = []
332333
IdXMLFile().load(input_id, prot_ids, pep_ids)
333334

334335
if not pep_ids:
335336
self.logger.warning("No peptide identifications found in input file")
336-
return [], [], None
337+
return PeptideIdentificationList(), [], None
337338

338339
# Keep only best hits
339340
IDFilter().keepNBestHits(pep_ids, 1)
@@ -437,9 +438,13 @@ def run(
437438
self.logger.error("No valid peptide identification or spectrum data found, process terminated.")
438439
return 1
439440

440-
# 1. Create scan number to spectrum mapping
441+
# 1. Create scan number to spectrum mapping AND RT-sorted index for fast lookup
441442
spectrum_map = {}
443+
rt_spectrum_list = [] # List of (RT, spectrum) tuples for binary search
444+
442445
for spectrum in exp:
446+
rt = spectrum.getRT()
447+
rt_spectrum_list.append((rt, spectrum))
443448
try:
444449
# Extract scan number from native ID
445450
native_id = spectrum.getNativeID()
@@ -456,6 +461,10 @@ def run(
456461
except Exception as e:
457462
self.logger.warning(f"Cannot extract scan number from native ID: {native_id}, error: {str(e)}")
458463

464+
# Sort by RT for binary search (O(n log n) once, then O(log n) per lookup)
465+
rt_spectrum_list.sort(key=lambda x: x[0])
466+
rt_values = np.array([rt for rt, _ in rt_spectrum_list])
467+
459468
# 2. Collect all PSM objects
460469
all_psms = []
461470
for i, pep_id in enumerate(pep_ids, 1):
@@ -481,10 +490,23 @@ def run(
481490
spectrum = spectrum_map[scan_num]
482491
self.logger.debug(f"Found matching spectrum by scan number {scan_num}")
483492
else:
484-
# If scan number unavailable or no match found, try RT matching
485-
for spec in exp:
486-
if abs(spec.getRT() - rt) <= rt_tolerance:
487-
spectrum = spec
493+
# If scan number unavailable or no match found, try RT matching with binary search
494+
# O(log n) instead of O(n) per lookup
495+
idx = np.searchsorted(rt_values, rt)
496+
497+
# Check candidates near the insertion point
498+
candidates = []
499+
if idx > 0:
500+
candidates.append(idx - 1)
501+
if idx < len(rt_values):
502+
candidates.append(idx)
503+
if idx + 1 < len(rt_values):
504+
candidates.append(idx + 1)
505+
506+
for candidate_idx in candidates:
507+
candidate_rt, candidate_spec = rt_spectrum_list[candidate_idx]
508+
if abs(candidate_rt - rt) <= rt_tolerance:
509+
spectrum = candidate_spec
488510
self.logger.debug(f"Found matching spectrum by RT {rt}")
489511
break
490512

@@ -620,7 +642,7 @@ def run(
620642
self.logger.info("Second round calculation completed")
621643

622644
# 6. Write results to output file (using second round calculation results)
623-
new_pep_ids = []
645+
new_pep_ids = PeptideIdentificationList()
624646
phospho_count = 0
625647
for psm in all_psms:
626648
idx = all_psms.index(psm)
@@ -660,8 +682,7 @@ def run(
660682
new_pep_id.setScoreType("Luciphor_delta_score")
661683
new_pep_id.setHigherScoreBetter(True)
662684
new_pep_id.setHits([hit])
663-
new_pep_id.assignRanks()
664-
new_pep_ids.append(new_pep_id)
685+
new_pep_ids.push_back(new_pep_id)
665686

666687
# Count phosphorylated peptides
667688
try:

onsite/lucxor/constants.py

Lines changed: 24 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
Constants and default configurations for pyLuciPHOr2
33
"""
44

5+
from . import mass_provider
6+
57
# Algorithm types
68
ALGORITHM_CID = 0
79
ALGORITHM_HCD = 1
@@ -37,46 +39,29 @@
3739
XTDHYPERSCORE = 3
3840
XCORR = 4
3941

40-
# Physical constants
41-
WATER_MASS = 18.010564684
42-
PROTON_MASS = 1.00727646688
42+
# Physical constants - derived from PyOpenMS
43+
WATER_MASS = mass_provider.get_water_mass()
44+
PROTON_MASS = mass_provider.get_proton_mass()
4345
PPM = 1.0 / 1000000.0
4446
TINY_NUM = 1e-10
4547
MIN_DELTA_SCORE = 0.1
4648
FUNCTION_TIME_LIMIT = 120 # seconds
4749

48-
# Amino acid masses (monoisotopic)
49-
AA_MASSES = {
50-
"A": 71.03711,
51-
"C": 103.00919,
52-
"D": 115.02694,
53-
"E": 129.04259,
54-
"F": 147.06841,
55-
"G": 57.02146,
56-
"H": 137.05891,
57-
"I": 113.08406,
58-
"K": 128.09496,
59-
"L": 113.08406,
60-
"M": 131.04049,
61-
"N": 114.04293,
62-
"P": 97.05276,
63-
"Q": 128.05858,
64-
"R": 156.10111,
65-
"S": 87.03203,
66-
"T": 101.04768,
67-
"V": 99.06841,
68-
"W": 186.07931,
69-
"Y": 163.06333,
70-
}
50+
# Modification masses - derived from PyOpenMS
51+
PHOSPHO_MOD_MASS = mass_provider.get_phospho_mass()
52+
OXIDATION_MASS = mass_provider.get_oxidation_mass()
53+
54+
# Amino acid masses (monoisotopic) - derived from PyOpenMS ResidueDB
55+
AA_MASSES = mass_provider.get_all_aa_masses()
7156

7257
# Add lowercase letter mass definitions for modification sites (including modification mass)
7358
AA_MASSES.update(
7459
{
75-
"s": 87.03203 + 79.966331, # Ser + phosphorylation
76-
"t": 101.04768 + 79.966331, # Thr + phosphorylation
77-
"y": 163.06333 + 79.966331, # Tyr + phosphorylation
78-
"a": 71.03711 + 79.966331, # Ala + PhosphoDecoy
79-
"m": 131.04049 + 15.994915, # Met + oxidation
60+
"s": AA_MASSES["S"] + PHOSPHO_MOD_MASS, # Ser + phosphorylation
61+
"t": AA_MASSES["T"] + PHOSPHO_MOD_MASS, # Thr + phosphorylation
62+
"y": AA_MASSES["Y"] + PHOSPHO_MOD_MASS, # Tyr + phosphorylation
63+
"a": AA_MASSES["A"] + PHOSPHO_MOD_MASS, # Ala + PhosphoDecoy
64+
"m": AA_MASSES["M"] + OXIDATION_MASS, # Met + oxidation
8065
}
8166
)
8267

@@ -108,8 +93,8 @@
10893
AA_DECOY_MAP = {v: k for k, v in DECOY_AA_MAP.items()}
10994

11095
# Add mass definitions for all decoy symbols
111-
# decoy amino acid mass = original amino acid mass + decoyMass (79.966331)
112-
DECOY_MASS = 79.966331
96+
# decoy amino acid mass = original amino acid mass + decoyMass (Phospho mass)
97+
DECOY_MASS = PHOSPHO_MOD_MASS
11398
for decoy_aa, orig_aa in DECOY_AA_MAP.items():
11499
if decoy_aa not in AA_MASSES and orig_aa in AA_MASSES:
115100
AA_MASSES[decoy_aa] = AA_MASSES[orig_aa] + DECOY_MASS
@@ -173,10 +158,10 @@
173158
"Sequest Xcorr": 4,
174159
}
175160

176-
# Modification masses
161+
# Modification masses dict - derived from PyOpenMS
177162
MOD_MASSES = {
178-
"Phospho": 79.966331,
179-
"Oxidation": 15.994915,
163+
"Phospho": PHOSPHO_MOD_MASS,
164+
"Oxidation": OXIDATION_MASS,
180165
}
181166

182167
# Decoy amino acid mapping
@@ -186,10 +171,6 @@
186171
"Y": "F", # Tyr -> Phe
187172
}
188173

189-
# New constants
190-
PHOSPHO_MOD_MASS = 79.966331
191-
OXIDATION_MASS = 15.994915
192-
193174
# Character types
194175
SINGLE_CHAR = 0
195176

@@ -200,6 +181,6 @@
200181
# Minimum values
201182
MIN_NUM_NEG_PKS = 50000
202183

203-
# Physical constants
204-
WATER = 18.01056
205-
PROTON = 1.00728
184+
# Physical constants (aliases for backward compatibility)
185+
WATER = WATER_MASS
186+
PROTON = PROTON_MASS

0 commit comments

Comments
 (0)