bigbio
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/README.md b/docs/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/onsite/ascore/cli.py b/onsite/ascore/cli.py
Lines changed: 21 additions & 31 deletions
diff --git a/onsite/lucxor/cli.py b/onsite/lucxor/cli.py
Lines changed: 32 additions & 11 deletions
diff --git a/onsite/lucxor/constants.py b/onsite/lucxor/constants.py
Lines changed: 24 additions & 43 deletions
@@ -47,7 +47,7 @@ onsite provides three complementary algorithms for PTM localization:
 ### Prerequisites
 
 - Python 3.11+
-- PyOpenMS 3.4.0+
+- PyOpenMS 3.5.0+
 - NumPy 2.3.2+
 - SciPy 1.16.1+
 
 
@@ -79,7 +79,7 @@ onsite provides three complementary algorithms for PTM localization:
 ### Prerequisites
 
 - Python 3.11+
-- PyOpenMS 3.4.0+
+- PyOpenMS 3.5.0+
 - NumPy 2.3.2+
 - SciPy 1.16.1+
 
 
@@ -6,7 +6,7 @@
 import logging
 import traceback
 from datetime import datetime
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import click
 from pyopenms import *
@@ -143,7 +143,7 @@ def ascore(
 
         # Main processing loop
         start_time = time.time()
-        processed_peptide_ids = []
+        processed_peptide_ids = PeptideIdentificationList()
 
         # Process each PeptideIdentification (optionally in parallel)
         if max(1, int(threads)) == 1:
@@ -165,7 +165,7 @@ def ascore(
                     )
 
                     if result["status"] == "success":
-                        processed_peptide_ids.append(result["new_pid"])
+                        processed_peptide_ids.push_back(result["new_pid"])
                         stats["processed"] += 1
                         phospho_count = len([h for h in result["new_pid"].getHits() 
                                           if "(Phospho)" in h.getSequence().toString()])
@@ -187,13 +187,13 @@ def ascore(
         else:
             workers = max(1, int(threads))
             click.echo(
-                f"[{time.strftime('%H:%M:%S')}] Parallel execution with {workers} processes"
+                f"[{time.strftime('%H:%M:%S')}] Parallel execution with {workers} threads"
             )
 
             if debug:
                 logger.info(f"Starting parallel processing with {workers} workers")
 
-            # Build serializable tasks in dict format
+            # Build tasks - with threads we can pass objects directly (shared memory)
             params = {
                 "fragment_mass_tolerance": fragment_mass_tolerance,
                 "fragment_mass_unit": fragment_mass_unit,
@@ -208,7 +208,7 @@ def ascore(
                     hit_payloads.append({"sequence": seq_str, "proforma": proforma})
                 tasks.append({
                     "idx": idx,
-                    "mzml_path": in_file,
+                    "exp": exp,  # Pass spectrum object directly - shared between threads
                     "params": params,
                     "pid": {
                         "mz": pid.getMZ(),
@@ -221,8 +221,8 @@ def ascore(
                 logger.info(f"Created {len(tasks)} parallel tasks")
 
             indexed_results = {}
-            with ProcessPoolExecutor(max_workers=workers) as executor:
-                futures = {executor.submit(_worker_process_pid, t): t["idx"] for t in tasks}
+            with ThreadPoolExecutor(max_workers=workers) as executor:
+                futures = {executor.submit(_worker_process_pid_threaded, t): t["idx"] for t in tasks}
                 for fut in as_completed(futures):
                     idx = futures[fut]
                     try:
@@ -277,7 +277,7 @@ def ascore(
                         new_hits.append(new_hit)
 
                     new_pid.setHits(new_hits)
-                    processed_peptide_ids.append(new_pid)
+                    processed_peptide_ids.push_back(new_pid)
                     stats["processed"] += 1
                     phospho_count = len([h for h in new_pid.getHits() if "(Phospho)" in h.getSequence().toString()])
                     stats["phospho"] += phospho_count
@@ -337,7 +337,7 @@ def load_identifications(idxml_file):
     """Load identification results"""
     print(f"[{time.strftime('%H:%M:%S')}] Loading identifications from {idxml_file}")
     protein_ids = []
-    peptide_ids = []
+    peptide_ids = PeptideIdentificationList()
     IdXMLFile().load(idxml_file, protein_ids, peptide_ids)
     print(f"Loaded {len(peptide_ids)} peptide identifications")
     return protein_ids, peptide_ids
@@ -440,34 +440,24 @@ def find_spectrum_by_mz(exp, target_mz, rt=None, ppm_tolerance=10):
     return best_match
 
 
-# ----------------------- Multiprocessing worker utilities -----------------------
-_WORKER_EXP = None
+# ----------------------- Threading worker utilities -----------------------
+# Note: ThreadPoolExecutor is used instead of ProcessPoolExecutor so that worker
+# threads share the loaded spectrum data (exp object) without reloading the file.
+# This saves the per-process reload cost; pure-Python work still contends for the GIL.
 
 
-def _worker_get_exp(mzml_file):
-    global _WORKER_EXP
-    if _WORKER_EXP is None:
-        exp = MSExperiment()
-        FileHandler().loadExperiment(mzml_file, exp)
-        # Warm up spectrum index inside the worker for faster lookups
-        if exp.size() > 0:
-            # Rebuild local cache for find_spectrum_by_mz in this process
-            if hasattr(find_spectrum_by_mz, "spectrum_list"):
-                delattr(find_spectrum_by_mz, "spectrum_list")
-            _ = find_spectrum_by_mz(exp, 0.0, None)
-        _WORKER_EXP = exp
-    return _WORKER_EXP
+def _worker_process_pid_threaded(task):
+    """Thread-safe worker that uses shared spectrum data.
 
-
-def _worker_process_pid(task):
+    Unlike process-based workers, threads share memory so we can pass
+    the exp object directly without serialization or file reloading.
+    """
     try:
-        mzml_path = task["mzml_path"]
+        exp = task["exp"]  # Shared spectrum object - no file reload needed
         pid_info = task["pid"]
         params = task["params"]
 
-        exp = _worker_get_exp(mzml_path)
-
-        # Find spectrum
+        # Find spectrum (uses shared cache from main thread)
         spectrum = find_spectrum_by_mz(exp, pid_info["mz"], pid_info.get("rt"))
         if spectrum is None:
             return {"status": "error", "reason": "spectrum_not_found"}
 
@@ -18,6 +18,7 @@
     MzMLFile,
     MSExperiment,
     PeptideIdentification,
+    PeptideIdentificationList,
     ProteinIdentification,
     IDFilter,
 )
@@ -324,16 +325,16 @@ def __init__(self):
 
     def load_input_files(
         self, input_id: str, input_spectrum: str
-    ) -> Tuple[List[PeptideIdentification], List[ProteinIdentification], MSExperiment]:
+    ) -> Tuple[PeptideIdentificationList, List[ProteinIdentification], MSExperiment]:
         """Load input files"""
         # Load identifications
-        pep_ids = []
+        pep_ids = PeptideIdentificationList()
         prot_ids = []
         IdXMLFile().load(input_id, prot_ids, pep_ids)
 
         if not pep_ids:
             self.logger.warning("No peptide identifications found in input file")
-            return [], [], None
+            return PeptideIdentificationList(), [], None
 
         # Keep only best hits
         IDFilter().keepNBestHits(pep_ids, 1)
@@ -437,9 +438,13 @@ def run(
             self.logger.error("No valid peptide identification or spectrum data found, process terminated.")
             return 1
 
-        # 1. Create scan number to spectrum mapping
+        # 1. Create scan number to spectrum mapping AND RT-sorted index for fast lookup
         spectrum_map = {}
+        rt_spectrum_list = []  # List of (RT, spectrum) tuples for binary search
+
         for spectrum in exp:
+            rt = spectrum.getRT()
+            rt_spectrum_list.append((rt, spectrum))
             try:
                 # Extract scan number from native ID
                 native_id = spectrum.getNativeID()
@@ -456,6 +461,10 @@ def run(
             except Exception as e:
                 self.logger.warning(f"Cannot extract scan number from native ID: {native_id}, error: {str(e)}")
 
+        # Sort by RT for binary search (O(n log n) once, then O(log n) per lookup)
+        rt_spectrum_list.sort(key=lambda x: x[0])
+        rt_values = np.array([rt for rt, _ in rt_spectrum_list])
+
         # 2. Collect all PSM objects
         all_psms = []
         for i, pep_id in enumerate(pep_ids, 1):
@@ -481,10 +490,23 @@ def run(
                 spectrum = spectrum_map[scan_num]
                 self.logger.debug(f"Found matching spectrum by scan number {scan_num}")
             else:
-                # If scan number unavailable or no match found, try RT matching
-                for spec in exp:
-                    if abs(spec.getRT() - rt) <= rt_tolerance:
-                        spectrum = spec
+                # If scan number unavailable or no match found, try RT matching with binary search
+                # O(log n) instead of O(n) per lookup
+                idx = np.searchsorted(rt_values, rt)
+
+                # Check candidates near the insertion point
+                candidates = []
+                if idx > 0:
+                    candidates.append(idx - 1)
+                if idx < len(rt_values):
+                    candidates.append(idx)
+                if idx + 1 < len(rt_values):
+                    candidates.append(idx + 1)
+
+                for candidate_idx in candidates:
+                    candidate_rt, candidate_spec = rt_spectrum_list[candidate_idx]
+                    if abs(candidate_rt - rt) <= rt_tolerance:
+                        spectrum = candidate_spec
                         self.logger.debug(f"Found matching spectrum by RT {rt}")
                         break
 
@@ -620,7 +642,7 @@ def run(
         self.logger.info("Second round calculation completed")
 
         # 6. Write results to output file (using second round calculation results)
-        new_pep_ids = []
+        new_pep_ids = PeptideIdentificationList()
         phospho_count = 0
         for psm in all_psms:
             idx = all_psms.index(psm)
@@ -660,8 +682,7 @@ def run(
                 new_pep_id.setScoreType("Luciphor_delta_score")
                 new_pep_id.setHigherScoreBetter(True)
                 new_pep_id.setHits([hit])
-                new_pep_id.assignRanks()
-                new_pep_ids.append(new_pep_id)
+                new_pep_ids.push_back(new_pep_id)
 
                 # Count phosphorylated peptides
                 try:
 
@@ -2,6 +2,8 @@
 Constants and default configurations for pyLuciPHOr2
 """
 
+from . import mass_provider
+
 # Algorithm types
 ALGORITHM_CID = 0
 ALGORITHM_HCD = 1
@@ -37,46 +39,29 @@
 XTDHYPERSCORE = 3
 XCORR = 4
 
-# Physical constants
-WATER_MASS = 18.010564684
-PROTON_MASS = 1.00727646688
+# Physical constants - derived from PyOpenMS
+WATER_MASS = mass_provider.get_water_mass()
+PROTON_MASS = mass_provider.get_proton_mass()
 PPM = 1.0 / 1000000.0
 TINY_NUM = 1e-10
 MIN_DELTA_SCORE = 0.1
 FUNCTION_TIME_LIMIT = 120  # seconds
 
-# Amino acid masses (monoisotopic)
-AA_MASSES = {
-    "A": 71.03711,
-    "C": 103.00919,
-    "D": 115.02694,
-    "E": 129.04259,
-    "F": 147.06841,
-    "G": 57.02146,
-    "H": 137.05891,
-    "I": 113.08406,
-    "K": 128.09496,
-    "L": 113.08406,
-    "M": 131.04049,
-    "N": 114.04293,
-    "P": 97.05276,
-    "Q": 128.05858,
-    "R": 156.10111,
-    "S": 87.03203,
-    "T": 101.04768,
-    "V": 99.06841,
-    "W": 186.07931,
-    "Y": 163.06333,
-}
+# Modification masses - derived from PyOpenMS
+PHOSPHO_MOD_MASS = mass_provider.get_phospho_mass()
+OXIDATION_MASS = mass_provider.get_oxidation_mass()
+
+# Amino acid masses (monoisotopic) - derived from PyOpenMS ResidueDB
+AA_MASSES = mass_provider.get_all_aa_masses()
 
 # Add lowercase letter mass definitions for modification sites (including modification mass)
 AA_MASSES.update(
     {
-        "s": 87.03203 + 79.966331,  # Ser + phosphorylation
-        "t": 101.04768 + 79.966331,  # Thr + phosphorylation
-        "y": 163.06333 + 79.966331,  # Tyr + phosphorylation
-        "a": 71.03711 + 79.966331,  # Ala + PhosphoDecoy
-        "m": 131.04049 + 15.994915,  # Met + oxidation
+        "s": AA_MASSES["S"] + PHOSPHO_MOD_MASS,  # Ser + phosphorylation
+        "t": AA_MASSES["T"] + PHOSPHO_MOD_MASS,  # Thr + phosphorylation
+        "y": AA_MASSES["Y"] + PHOSPHO_MOD_MASS,  # Tyr + phosphorylation
+        "a": AA_MASSES["A"] + PHOSPHO_MOD_MASS,  # Ala + PhosphoDecoy
+        "m": AA_MASSES["M"] + OXIDATION_MASS,  # Met + oxidation
     }
 )
 
@@ -108,8 +93,8 @@
 AA_DECOY_MAP = {v: k for k, v in DECOY_AA_MAP.items()}
 
 # Add mass definitions for all decoy symbols
-# decoy amino acid mass = original amino acid mass + decoyMass (79.966331)
-DECOY_MASS = 79.966331
+# decoy amino acid mass = original amino acid mass + decoyMass (Phospho mass)
+DECOY_MASS = PHOSPHO_MOD_MASS
 for decoy_aa, orig_aa in DECOY_AA_MAP.items():
     if decoy_aa not in AA_MASSES and orig_aa in AA_MASSES:
         AA_MASSES[decoy_aa] = AA_MASSES[orig_aa] + DECOY_MASS
@@ -173,10 +158,10 @@
     "Sequest Xcorr": 4,
 }
 
-# Modification masses
+# Modification masses dict - derived from PyOpenMS
 MOD_MASSES = {
-    "Phospho": 79.966331,
-    "Oxidation": 15.994915,
+    "Phospho": PHOSPHO_MOD_MASS,
+    "Oxidation": OXIDATION_MASS,
 }
 
 # Decoy amino acid mapping
@@ -186,10 +171,6 @@
     "Y": "F",  # Tyr -> Phe
 }
 
-# New constants
-PHOSPHO_MOD_MASS = 79.966331
-OXIDATION_MASS = 15.994915
-
 # Character types
 SINGLE_CHAR = 0
 
@@ -200,6 +181,6 @@
 # Minimum values
 MIN_NUM_NEG_PKS = 50000
 
-# Physical constants
-WATER = 18.01056
-PROTON = 1.00728
+# Physical constants (aliases for backward compatibility)
+WATER = WATER_MASS
+PROTON = PROTON_MASS