Skip to content

Commit 95d01d2

Browse files
authored
Merge pull request #14 from CompOmics/feat/ms2pip-improvements
Speed up annotation and theoretical m/z calculation for MS²PIP
2 parents 1a28385 + 4829ea4 commit 95d01d2

11 files changed

Lines changed: 386 additions & 347 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "ms2rescore-rs"
3-
version = "0.5.0-alpha.2"
3+
version = "0.5.0-alpha.3"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

src/annotation.rs

Lines changed: 76 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,11 @@ use rayon::prelude::*;
77

88
use crate::types::annotation::{AnnotatedMS2Spectrum, FragmentAnnotation};
99
use crate::types::ms2_spectrum::MS2Spectrum;
10-
use crate::utils::parse_fragment;
10+
use crate::utils::{build_theoretical_fragments, search_sorted_mz};
1111

12-
use ordered_float::OrderedFloat;
1312
use rustyms::annotation::model::FragmentationModel;
14-
use rustyms::annotation::AnnotatableSpectrum;
1513
use rustyms::chemistry::MassMode;
1614
use rustyms::prelude::CompoundPeptidoformIon;
17-
use rustyms::spectrum::{PeakSpectrum, RawPeak, RawSpectrum};
1815
use rustyms::system::f64::MassOverCharge;
1916
use rustyms::system::mass_over_charge::thomson;
2017

@@ -58,21 +55,19 @@ fn parse_tolerance(
5855

5956
/// Annotate MS2 spectra with theoretical fragment ions.
6057
#[pyfunction]
61-
#[allow(clippy::too_many_arguments)]
6258
pub fn annotate_ms2_spectra(
6359
py: Python<'_>,
6460
spectra: Vec<Py<MS2Spectrum>>,
6561
proformas: Vec<String>,
66-
seq_lens: Vec<usize>,
6762
fragmentation_model: String,
6863
mass_mode: String,
6964
tolerance_value: f64,
7065
tolerance_mode: String,
7166
) -> PyResult<Vec<AnnotatedMS2Spectrum>> {
7267
let n = spectra.len();
73-
if proformas.len() != n || seq_lens.len() != n {
68+
if proformas.len() != n {
7469
return Err(PyException::new_err(
75-
"Input arrays must have identical length: spectra, proformas, seq_lens",
70+
"Input arrays must have identical length: spectra, proformas",
7671
));
7772
}
7873

@@ -83,7 +78,6 @@ pub fn annotate_ms2_spectra(
8378
mz_f32: Vec<f32>,
8479
intensity_f32: Vec<f32>,
8580
precursor: Option<crate::types::precursor::Precursor>,
86-
seq_len: usize,
8781
precursor_charge: i32,
8882
proforma: String,
8983
}
@@ -106,58 +100,77 @@ pub fn annotate_ms2_spectra(
106100
let spec_ref = spectra[i].bind(py);
107101
let spec = spec_ref.borrow();
108102

103+
let proforma = proformas[i]
104+
.split('/')
105+
.next()
106+
.unwrap_or(&proformas[i])
107+
.to_string();
108+
109109
owned.push(OwnedSpec {
110110
id: spec.identifier.clone(),
111111
mz_f32: spec.mz.clone(),
112112
intensity_f32: spec.intensity.clone(),
113113
precursor: spec.precursor.clone(),
114-
seq_len: seq_lens[i],
115114
precursor_charge: spec
116115
.precursor
117116
.as_ref()
118117
.map(|p| p.charge as i32)
119118
.unwrap_or(0),
120-
proforma: proformas[i]
121-
.split('/')
122-
.next()
123-
.unwrap_or(&proformas[i])
124-
.to_string(),
119+
proforma,
125120
});
126121
}
127122

128123
let model = parse_fragmentation_model(&fragmentation_model)?;
129124
let mode = parse_mass_mode(&mass_mode)?;
130125
let tolerance = parse_tolerance(tolerance_value, &tolerance_mode)?;
131-
let params = rustyms::annotation::model::MatchingParameters::default().tolerance(tolerance);
132-
133-
// Precompute theoretical fragments and parsed peptides per unique peptide+charge
134-
type CacheEntry = (CompoundPeptidoformIon, Vec<rustyms::fragment::Fragment>);
135-
136-
let mut frag_cache: HashMap<(String, i32), Arc<Option<CacheEntry>>> = HashMap::new();
137-
for item in &owned {
138-
let key = (item.proforma.clone(), item.precursor_charge);
139-
if item.precursor_charge <= 0 || frag_cache.contains_key(&key) {
140-
continue;
141-
}
142-
143-
let entry = CompoundPeptidoformIon::pro_forma(&item.proforma, None)
144-
.ok()
145-
.map(|peptide| {
146-
let frag_charge = rustyms::system::isize::Charge::new::<rustyms::system::e>(
147-
item.precursor_charge as isize,
148-
);
149-
let frags = peptide.generate_theoretical_fragments(frag_charge, &model);
150-
(peptide, frags)
151-
});
152-
153-
frag_cache.insert(key, Arc::new(entry));
126+
// Precompute theoretical fragments per unique peptidoform+charge
127+
struct CacheEntry {
128+
fragments: Vec<crate::utils::CachedFragment>,
129+
seq_len: usize,
154130
}
131+
type FragCache = Arc<HashMap<(String, i32), Arc<Option<CacheEntry>>>>;
132+
133+
let unique_keys: Vec<(String, i32)> = {
134+
let mut seen = HashMap::new();
135+
for item in &owned {
136+
if item.precursor_charge > 0 {
137+
seen.entry((item.proforma.clone(), item.precursor_charge))
138+
.or_insert(());
139+
}
140+
}
141+
seen.into_keys().collect()
142+
};
155143

156-
let frag_cache = Arc::new(frag_cache);
157-
let params = Arc::new(params);
158-
159-
// Release GIL and parallelize
144+
// Release GIL and parallelize both cache building and annotation
160145
let results: Result<Vec<AnnotatedMS2Spectrum>, String> = py.detach(|| {
146+
let frag_cache: FragCache = Arc::new(
147+
unique_keys
148+
.into_par_iter()
149+
.map(|(proforma, charge)| {
150+
let entry = CompoundPeptidoformIon::pro_forma(&proforma, None)
151+
.ok()
152+
.map(|peptidoform| {
153+
let seq_len = peptidoform
154+
.peptidoforms()
155+
.next()
156+
.map(|pf| pf.sequence().len())
157+
.unwrap_or(0);
158+
let frag_charge =
159+
rustyms::system::isize::Charge::new::<rustyms::system::e>(
160+
charge as isize,
161+
);
162+
let fragments =
163+
build_theoretical_fragments(&peptidoform, frag_charge, &model, mode);
164+
CacheEntry {
165+
fragments,
166+
seq_len,
167+
}
168+
});
169+
((proforma, charge), Arc::new(entry))
170+
})
171+
.collect(),
172+
);
173+
161174
owned
162175
.into_par_iter()
163176
.map(|item| {
@@ -168,53 +181,36 @@ pub fn annotate_ms2_spectra(
168181
));
169182
}
170183

171-
if item.mz_f32.is_empty() || item.seq_len == 0 || item.precursor_charge <= 0 {
184+
if item.mz_f32.is_empty() || item.precursor_charge <= 0 {
172185
return Ok(item.empty_annotated());
173186
}
174187

175188
let key = (item.proforma.clone(), item.precursor_charge);
176189
let cache_entry = frag_cache.get(&key).and_then(|e| e.as_ref().as_ref());
177190

178-
let (peptide, frags) = match cache_entry {
179-
Some((p, f)) if !f.is_empty() => (p, f),
191+
let entry = match cache_entry {
192+
Some(e) if e.seq_len > 0 && !e.fragments.is_empty() => e,
180193
_ => return Ok(item.empty_annotated()),
181194
};
182195

183-
// Build RawSpectrum from f32 data (convert to f64 in-place, no pre-stored copy)
184-
let mut spectrum = RawSpectrum::default();
185-
spectrum.title = item.id.clone();
186-
spectrum.num_scans = 1;
187-
188-
let peaks: Vec<RawPeak> = item
189-
.mz_f32
190-
.iter()
191-
.zip(item.intensity_f32.iter())
192-
.map(|(&mz, &inten)| RawPeak {
193-
mz: MassOverCharge::new::<thomson>(mz as f64),
194-
intensity: OrderedFloat(inten as f64),
195-
})
196-
.collect();
197-
198-
spectrum.extend(peaks);
199-
200-
let annotated = spectrum.annotate(peptide.clone(), frags.as_slice(), &params, mode);
201-
202-
let peak_annotations: Vec<Vec<FragmentAnnotation>> = annotated
203-
.spectrum()
204-
.map(|peak| {
205-
peak.annotation
206-
.iter()
207-
.filter_map(|frag| {
208-
let (series, position, charge) = parse_fragment(frag)?;
209-
Some(FragmentAnnotation {
210-
series: series.to_string(),
211-
position,
212-
charge,
213-
})
214-
})
215-
.collect()
216-
})
217-
.collect();
196+
let mz_range = MassOverCharge::new::<thomson>(0.0)
197+
..=MassOverCharge::new::<thomson>(f64::MAX);
198+
let mut peak_annotations = vec![Vec::new(); item.mz_f32.len()];
199+
200+
for frag in &entry.fragments {
201+
let fragment_mz = MassOverCharge::new::<thomson>(frag.mz);
202+
if !mz_range.contains(&fragment_mz) {
203+
continue;
204+
}
205+
206+
if let Some(idx) = search_sorted_mz(&item.mz_f32, frag.mz, &tolerance) {
207+
peak_annotations[idx].push(FragmentAnnotation {
208+
series: frag.series.to_string(),
209+
position: frag.position,
210+
charge: frag.charge,
211+
});
212+
}
213+
}
218214

219215
Ok(AnnotatedMS2Spectrum {
220216
identifier: item.id,

src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,9 @@ fn ms2rescore_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
3535
ms2pip::theoretical_mz::ms2pip_compute_theoretical_mz,
3636
m
3737
)?)?;
38+
m.add_function(wrap_pyfunction!(
39+
ms2pip::targets::ms2pip_extract_targets,
40+
m
41+
)?)?;
3842
Ok(())
3943
}

src/ms2pip/feature_vectors.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,14 @@ fn quartiles_with_denom(sorted: &[u32], denom: usize) -> [u32; 5] {
4848
/// Extract amino acid indices and charge from a ProForma string.
4949
/// Returns (aa_indices, charge). Modified residues are mapped to their base AA.
5050
fn parse_proforma(proforma: &str) -> Result<(Vec<usize>, usize), String> {
51-
let compound = CompoundPeptidoformIon::pro_forma(proforma, None)
51+
let peptidoform = CompoundPeptidoformIon::pro_forma(proforma, None)
5252
.map_err(|e| format!("Failed to parse ProForma '{proforma}': {e}"))?;
5353

54-
let charge = extract_charge(&compound)
54+
let charge = extract_charge(&peptidoform)
5555
.ok_or_else(|| format!("No charge state found in '{proforma}'. MS2PIP requires a charge (e.g. 'PEPTIDE/2')."))?;
5656

5757
// Extract amino acid sequence — must be exactly one peptidoform
58-
let peptidoform_ions = compound.peptidoform_ions();
58+
let peptidoform_ions = peptidoform.peptidoform_ions();
5959
if peptidoform_ions.len() != 1 {
6060
return Err(format!(
6161
"Expected exactly 1 peptidoform ion in '{proforma}', found {}",
@@ -64,8 +64,8 @@ fn parse_proforma(proforma: &str) -> Result<(Vec<usize>, usize), String> {
6464
}
6565

6666
let mut aa_indices = Vec::new();
67-
for peptidoform in compound.peptidoforms() {
68-
for pos in peptidoform.sequence() {
67+
for peptide in peptidoform.peptidoforms() {
68+
for pos in peptide.sequence() {
6969
let aa = pos.aminoacid.aminoacid();
7070
let idx = aa_to_ms2pip_index(aa).ok_or_else(|| {
7171
format!("Unsupported amino acid '{aa}' in '{proforma}'")

src/ms2pip/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pub mod feature_vectors;
2+
pub mod targets;
23
pub mod theoretical_mz;

0 commit comments

Comments
 (0)