Skip to content

Commit b83de70

Browse files
Created factory method for MbrScorer (#887)
* revised the RevisedModifiedPeptides function * revised the run error checking * add the tester for merge * Edit the tester for the new detectionType * fix the bug and set intensity to 0 for ambiguous peak * solve the ambiguous peak intensity problem * Fix the bug on MBR score when there are all ambiguous peak inside * change some comment * refactored mbr scorer construction * Edited mbrScorer test * Added check to avoid invalid distributions in MbrScorer * Broke some stuff * Fixed remaining issues * removed NaNs in ppm error calculations * Changes to scorer changed the real data MBR test. Test assert statements have been amended --------- Co-authored-by: RayMSMS <150720362+RayMSMS@users.noreply.github.com>
1 parent e786e2f commit b83de70

File tree

6 files changed

+169
-97
lines changed

6 files changed

+169
-97
lines changed

mzLib/FlashLFQ/FlashLFQResults.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,7 @@ internal void RevisedModifiedPeptides()
810810
.DistinctBy(p=>p.ModifiedSequence)
811811
.Select(p=>p.ModifiedSequence)
812812
.ToList();
813+
813814
foreach (var modSeq in allIDs)
814815
{
815816
if (PeptideModifiedSequences.ContainsKey(modSeq))

mzLib/FlashLFQ/FlashLfqEngine.cs

Lines changed: 4 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -796,55 +796,6 @@ internal RtInfo PredictRetentionTime(
796796
return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: rtRange);
797797
}
798798

799-
/// <summary>
800-
/// Constructs a MbrScorer object that is used to score all MBR peaks for a given acceptor file
801-
/// </summary>
802-
/// <param name="acceptorFileIdentifiedPeaks"> All MSMS identified peaks in the acceptor file </param>
803-
/// <param name="fileSpecificMbrTolerance">A ppm tolerance specific to the given file</param>
804-
/// <returns> A MbrScorer object </returns>
805-
private MbrScorer BuildMbrScorer(List<ChromatographicPeak> acceptorFileIdentifiedPeaks, out PpmTolerance fileSpecificMbrTolerance)
806-
{
807-
// Construct a distribution of ppm errors for all MSMS peaks in the acceptor file
808-
var apexToAcceptorFilePeakDict = new Dictionary<IIndexedPeak, ChromatographicPeak>();
809-
List<double> ppmErrors = new List<double>();
810-
foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null
811-
&& PeptideModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence)
812-
&& p.Identifications.First().QValue < FlashParams.DonorQValueThreshold))
813-
{
814-
if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak))
815-
{
816-
apexToAcceptorFilePeakDict.Add(peak.Apex.IndexedPeak, peak);
817-
}
818-
819-
ppmErrors.Add(peak.MassError);
820-
}
821-
if (ppmErrors.Count < 3)
822-
{
823-
fileSpecificMbrTolerance = null;
824-
return null;
825-
}
826-
double ppmSpread = ppmErrors.Count > 30 ? ppmErrors.InterquartileRange() / 1.36 : ppmErrors.StandardDeviation();
827-
Normal ppmDistribution = new Normal(ppmErrors.Median(), ppmSpread);
828-
double fileSpecificMbrPpmTolerance = Math.Min(Math.Abs(ppmErrors.Median()) + ppmSpread * 4, FlashParams.MbrPpmTolerance);
829-
fileSpecificMbrTolerance = new PpmTolerance(fileSpecificMbrPpmTolerance); // match between runs PPM tolerance
830-
831-
// Construct a distribution of peak log intensities for all MSMS peaks in the acceptor file
832-
var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks
833-
.Where(p => p.Intensity > 0)
834-
.Select(p => Math.Log(p.Intensity, 2))
835-
.ToList();
836-
double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median();
837-
Normal logIntensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36);
838-
try // if the constructor fails, we don't want to crash the program
839-
{
840-
return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, logIntensityDistribution);
841-
}
842-
catch
843-
{
844-
return null;
845-
}
846-
}
847-
848799
/// <summary>
849800
/// Returns a pseudo-randomly selected peak that does not have the same mass as the donor
850801
/// </summary>
@@ -911,14 +862,16 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo acceptorFile)
911862
.Count() > 1;
912863

913864
// acceptor file known peaks
914-
var acceptorFileIdentifiedPeaks = _results.Peaks[acceptorFile];
865+
var acceptorFileIdentifiedPeaks = _results.Peaks[acceptorFile]
866+
.Where(p => p.Identifications.Any(id => PeptideModifiedSequencesToQuantify.Contains(id.ModifiedSequence)))
867+
.ToList();
915868

916869
// these are the analytes already identified in this run. we don't need to try to match them from other runs
917870
var acceptorFileIdentifiedSequences = new HashSet<string>(acceptorFileIdentifiedPeaks
918871
.Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01)
919872
.SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence)));
920873

921-
MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol);
874+
MbrScorer scorer = MbrScorerFactory.BuildMbrScorer(acceptorFileIdentifiedPeaks, FlashParams, out var mbrTol);
922875
if (scorer == null)
923876
return;
924877

mzLib/FlashLFQ/MBR/MbrScorer.cs

Lines changed: 91 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
using Easy.Common.EasyComparer;
2+
using MassSpectrometry;
23
using MathNet.Numerics.Distributions;
34
using MathNet.Numerics.Statistics;
5+
using MzLibUtil;
46
using System;
57
using System.Collections.Generic;
68
using System.Data;
79
using System.Data.Entity.ModelConfiguration.Conventions;
810
using System.Linq;
9-
using MassSpectrometry;
1011

1112
namespace FlashLFQ
1213
{
@@ -17,10 +18,11 @@ namespace FlashLFQ
1718
internal class MbrScorer
1819
{
1920
// Intensity and ppm distributions are specific to each acceptor file
20-
private readonly Normal _logIntensityDistribution;
21-
private readonly Normal _ppmDistribution;
22-
private readonly Normal _scanCountDistribution;
23-
private readonly Gamma _isotopicCorrelationDistribution;
21+
private Normal _logIntensityDistribution;
22+
private Normal _ppmDistribution;
23+
private Normal _scanCountDistribution;
24+
private Gamma _isotopicCorrelationDistribution;
25+
2426
// The logFcDistributions and rtDifference distributions are unique to each donor file - acceptor file pair
2527
private Dictionary<SpectraFileInfo, Normal> _logFcDistributionDictionary;
2628
private Dictionary<SpectraFileInfo, Normal> _rtPredictionErrorDistributionDictionary;
@@ -35,23 +37,69 @@ internal class MbrScorer
3537
/// </summary>
3638
internal MbrScorer(
3739
Dictionary<IIndexedPeak, ChromatographicPeak> apexToAcceptorFilePeakDict,
38-
List<ChromatographicPeak> acceptorFileMsmsPeaks,
39-
Normal ppmDistribution,
40-
Normal logIntensityDistribution)
40+
List<ChromatographicPeak> unambiguousAcceptorFilePeaks)
4141
{
4242
ApexToAcceptorFilePeakDict = apexToAcceptorFilePeakDict;
43-
UnambiguousMsMsAcceptorPeaks = acceptorFileMsmsPeaks.Where(p => p.Apex != null && p.DetectionType != DetectionType.MBR && p.NumIdentificationsByFullSeq == 1).ToList();
44-
MaxNumberOfScansObserved = acceptorFileMsmsPeaks.Max(peak => peak.ScanCount);
45-
_logIntensityDistribution = logIntensityDistribution;
46-
_ppmDistribution = ppmDistribution;
47-
_isotopicCorrelationDistribution = GetIsotopicEnvelopeCorrDistribution();
43+
UnambiguousMsMsAcceptorPeaks = unambiguousAcceptorFilePeaks;
44+
MaxNumberOfScansObserved = unambiguousAcceptorFilePeaks.Max(peak => peak.ScanCount);
45+
46+
// Initialize the dictionaries that will hold the log fold change and RT prediction error distributions
47+
// for each donor file
4848
_logFcDistributionDictionary = new();
4949
_rtPredictionErrorDistributionDictionary = new();
50+
}
51+
52+
/// <summary>
53+
/// Constructs the distributions that are used to score MBR matches
54+
/// </summary>
55+
/// <returns>Returns true if the scorer was initialized successfully, false otherwise</returns>
56+
internal bool InitializeScorer()
57+
{
58+
if (UnambiguousMsMsAcceptorPeaks.Count < 3)
59+
return false;
60+
61+
// Populate distributions for scoring MBR matches
62+
_logIntensityDistribution = GetLogIntensityDistribution();
63+
_ppmDistribution = GetPpmErrorDistribution();
64+
_isotopicCorrelationDistribution = GetIsotopicEnvelopeCorrDistribution();
65+
_scanCountDistribution = GetScanCountDistribution();
66+
67+
return IsValid();
68+
}
69+
70+
private Normal GetPpmErrorDistribution()
71+
{
72+
// Construct a distribution of ppm errors for all MSMS peaks in the acceptor file
73+
List<double> ppmErrors = UnambiguousMsMsAcceptorPeaks.Select(p => p.MassError).Where(e => !double.IsNaN(e)).ToList();
74+
if (ppmErrors.Count < 2)
75+
return null;
76+
double ppmSpread = ppmErrors.Count > 30 ? ppmErrors.InterquartileRange() / 1.36 : ppmErrors.StandardDeviation();
77+
Normal ppmDistribution = new Normal(ppmErrors.Median(), ppmSpread);
78+
return ppmDistribution;
79+
}
80+
81+
private Normal GetLogIntensityDistribution()
82+
{
83+
var logIntensities = UnambiguousMsMsAcceptorPeaks
84+
.Where(p => p.Intensity > 0)
85+
.Select(p => Math.Log(p.Intensity, 2))
86+
.ToList();
87+
88+
if (logIntensities.Count < 2)
89+
return null;
90+
91+
double mean = logIntensities.Median();
92+
double stdDev = logIntensities.InterquartileRange() / 1.36;
93+
return new Normal(mean, stdDev);
94+
}
5095

51-
// This is kludgey, because scan counts are discrete
96+
// This is kludgey, because scan counts are discrete
97+
private Normal GetScanCountDistribution()
98+
{
5299
List<double> scanList = UnambiguousMsMsAcceptorPeaks.Select(peak => (double)peak.ScanCount).ToList();
100+
53101
// build a normal distribution for the scan list of the acceptor peaks
54-
_scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36);
102+
return new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36);
55103
}
56104

57105
/// <summary>
@@ -61,11 +109,13 @@ internal MbrScorer(
61109
private Gamma GetIsotopicEnvelopeCorrDistribution()
62110
{
63111
var pearsonCorrs = UnambiguousMsMsAcceptorPeaks.Select(p => 1 - p.IsotopicPearsonCorrelation).Where(p => p > 0).ToList();
64-
if (pearsonCorrs.Count <= 1) return null;
112+
if (pearsonCorrs.Count < 2) return null;
65113
double mean = pearsonCorrs.Mean();
66114
double variance = pearsonCorrs.Variance();
67115
var alpha = Math.Pow(mean, 2) / variance;
68116
var beta = mean / variance;
117+
if (!Gamma.IsValidParameterSet(alpha, beta))
118+
return null;
69119
return new Gamma(alpha, beta);
70120
}
71121

@@ -98,20 +148,17 @@ internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List<double>
98148
rtPredictionErrors.Add(avgDiff - anchorPeptideRtDiffs[i]);
99149
}
100150

101-
Normal rtPredictionErrorDist = new Normal(0, 0);
102151
// Default distribution. Effectively assigns a RT Score of zero if no alignment can be performed
103152
// between the donor and acceptor based on shared MS/MS IDs
104-
105-
if(rtPredictionErrors.Any())
153+
Normal rtPredictionErrorDist = new Normal(0, 0);
154+
if (rtPredictionErrors.Count >= 2)
106155
{
107156
double medianRtError = rtPredictionErrors.Median();
108157
double stdDevRtError = rtPredictionErrors.StandardDeviation();
109-
if(stdDevRtError >= 0.0 && !double.IsNaN(medianRtError))
110-
{
158+
if (Normal.IsValidParameterSet(medianRtError, stdDevRtError))
111159
rtPredictionErrorDist = new Normal(medianRtError, 1);
112-
}
113160
}
114-
161+
115162
_rtPredictionErrorDistributionDictionary.Add(donorFile, rtPredictionErrorDist);
116163
}
117164

@@ -139,6 +186,14 @@ internal double ScoreMbr(MbrChromatographicPeak acceptorPeak, ChromatographicPea
139186
* acceptorPeak.IsotopicDistributionScore, 0.20);
140187
}
141188

189+
/// <summary>
190+
/// Returns four times the standard deviation of the ppm error distribution plus the absolute value of the median of the ppm error distribution
191+
/// </summary>
192+
internal double GetPpmErrorTolerance()
193+
{
194+
return _ppmDistribution.StdDev * 4 + Math.Abs(_ppmDistribution.Median);
195+
}
196+
142197
// Setting a minimum score prevents the MBR score from going to zero if one component of that score is 0
143198
// 3e-7 is the fraction of a normal distribution that lies at least 5 stdDev away from the mean
144199
private double _minScore = 3e-7;
@@ -189,10 +244,6 @@ internal double CalculateIntensityScore(double acceptorIntensity, Chromatographi
189244
/// <param name="idDonorPeaks"> List of peaks in the donor file. </param>
190245
internal void CalculateFoldChangeBetweenFiles(List<ChromatographicPeak> idDonorPeaks)
191246
{
192-
193-
var donorFileLogIntensities = idDonorPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList();
194-
double medianDonorLogIntensity = donorFileLogIntensities.Median();
195-
196247
// Find the difference in peptide intensities between donor and acceptor files
197248
// this intensity score creates a conservative bias in MBR
198249
List<double> listOfFoldChangesBetweenTheFiles = new List<double>();
@@ -244,7 +295,19 @@ internal bool IsValid(SpectraFileInfo donorFile)
244295
{
245296
return _rtPredictionErrorDistributionDictionary.TryGetValue(donorFile, out var rtDist)
246297
&& rtDist != null
247-
&& _ppmDistribution != null
298+
&& IsValid();
299+
}
300+
301+
/// <summary>
302+
/// This method checks whether the scorer is validly parameterized and capable of scoring MBR transfers
303+
/// Notably, it is indifferent to the isotopic correlation distribution being null, as a null isotopic distribution correlation
304+
/// results in all MBR transfers receiving the minimum score for the isotopic distribution component.
305+
/// This could be changed in the future, but currently multiple tests result in null isotopic distributions, and will break if they can't do MBR
306+
/// </summary>
307+
/// <returns></returns>
308+
internal bool IsValid()
309+
{
310+
return _ppmDistribution != null
248311
&& _scanCountDistribution != null
249312
&& _logIntensityDistribution != null;
250313
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
using MassSpectrometry;
2+
using MathNet.Numerics.Distributions;
3+
using MathNet.Numerics.Statistics;
4+
using MzLibUtil;
5+
using System;
6+
using System.Collections.Generic;
7+
using System.Linq;
8+
using System.Text;
9+
using System.Threading.Tasks;
10+
11+
namespace FlashLFQ
12+
{
13+
internal static class MbrScorerFactory
14+
{
15+
/// <summary>
16+
/// Constructs a MbrScorer object that is used to score all MBR peaks for a given acceptor file
17+
/// </summary>
18+
/// <param name="acceptorFileMsmsPeaks"> All MSMS identified peaks in the acceptor file that contain quantifiable peptides </param>
19+
/// <param name="fileSpecificMbrTolerance">A ppm tolerance specific to the given file</param>
20+
/// <returns> A MbrScorer object </returns>
21+
public static MbrScorer BuildMbrScorer(List<ChromatographicPeak> acceptorFileMsmsPeaks,
22+
FlashLfqParameters flashParams, out PpmTolerance fileSpecificMbrTolerance)
23+
{
24+
// Construct a dictionary linking each MSMS peak to the indexed peak of its apex.
25+
// This is to ensure MBR doesn't assign a peptide to a peak that is already claimed by one or more other peptides.
26+
var apexToAcceptorFilePeakDict = acceptorFileMsmsPeaks
27+
.Where(p => p.Apex != null)
28+
.DistinctBy(p => p.Apex.IndexedPeak)
29+
.ToDictionary(p => p.Apex.IndexedPeak, p => p);
30+
31+
MbrScorer scorer = new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileMsmsPeaks);
32+
// Try to initialize the scorer, which fails if there are fewer than three acceptor peaks or if the ppm error, scan count, or log intensity distribution could not be built.
33+
if (!scorer.InitializeScorer())
34+
{
35+
fileSpecificMbrTolerance = null;
36+
return null;
37+
}
38+
39+
double mbrPpmTolerance = Math.Min(scorer.GetPpmErrorTolerance(), flashParams.MbrPpmTolerance);
40+
fileSpecificMbrTolerance = new PpmTolerance(mbrPpmTolerance); // match between runs PPM tolerance
41+
return scorer;
42+
}
43+
44+
}
45+
}

mzLib/TestFlashLFQ/TestFlashLFQ.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,8 +1512,8 @@ public static void RealDataMbrTest()
15121512
// Any change to ML.NET or the PEP Analysis engine will cause these to change.
15131513
Console.WriteLine("r1 PIP event count: " + f1r1MbrResults.Count);
15141514
Console.WriteLine("r2 PIP event count: " + f1r2MbrResults.Count);
1515-
Assert.AreEqual(141, f1r1MbrResults.Count);
1516-
Assert.AreEqual(77, f1r2MbrResults.Count);
1515+
Assert.AreEqual(140, f1r1MbrResults.Count);
1516+
Assert.AreEqual(78, f1r2MbrResults.Count);
15171517

15181518
// Check that MS/MS identified peaks and MBR identified peaks have similar intensities
15191519
List<(double, double)> peptideIntensities = f1r1MbrResults.Select(pep => (Math.Log(pep.Value.GetIntensity(f1r1)), Math.Log(pep.Value.GetIntensity(f1r2)))).ToList();

0 commit comments

Comments
 (0)