Skip to content

Commit c883155

Browse files
trishortsMICHAEL SHORTREEDacesnik
authored
Proteoform bifurcation when reading xml with features (chain, signal, etc.) (#609)
* correct Within calculation * update unit tests * join products and cleave once products * unit tests for single cleavage products * remove unused test * fix broken unit tests * fix failing unit tests * add biomarkers during protein db load and revert add during digest * auto add biomarkers during protein database load if checked and revert biomarkers during digest * do not write biomarkers to xml database * default to retain methionine in biomarker generation * unit testage * increase unit test coverge * even more unit test coverage * more unit tests * codemaid * addressedReviewerComments Co-authored-by: MICHAEL SHORTREED <mrshortreed@wisc.edu> Co-authored-by: acesnik <anthony.cesnik@gmail.com>
1 parent d4348df commit c883155

File tree

12 files changed

+2118
-200
lines changed

12 files changed

+2118
-200
lines changed

mzLib/Proteomics/Protein/Protein.cs

Lines changed: 66 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ public Protein(string sequence, string accession, string organism = null, List<T
3333
IDictionary<int, List<Modification>> oneBasedModifications = null, List<ProteolysisProduct> proteolysisProducts = null,
3434
string name = null, string fullName = null, bool isDecoy = false, bool isContaminant = false, List<DatabaseReference> databaseReferences = null,
3535
List<SequenceVariation> sequenceVariations = null, List<SequenceVariation> appliedSequenceVariations = null, string sampleNameForVariants = null,
36-
List<DisulfideBond> disulfideBonds = null, List<SpliceSite> spliceSites = null, string databaseFilePath = null)
36+
List<DisulfideBond> disulfideBonds = null, List<SpliceSite> spliceSites = null, string databaseFilePath = null, bool addBiomarkers = false)
3737
{
3838
// Mandatory
3939
BaseSequence = sequence;
@@ -64,6 +64,10 @@ public Protein(string sequence, string accession, string organism = null, List<T
6464
DatabaseReferences = databaseReferences ?? new List<DatabaseReference>();
6565
DisulfideBonds = disulfideBonds ?? new List<DisulfideBond>();
6666
SpliceSites = spliceSites ?? new List<SpliceSite>();
67+
if (addBiomarkers)
68+
{
69+
this.AddBiomarkers();
70+
}
6771
}
6872

6973
/// <summary>
@@ -158,9 +162,9 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable<Sequence
158162

159163
//TODO: Generate all the proteolytic products as distinct proteins during XML reading and delete the ProteolysisProducts parameter
160164
public IEnumerable<ProteolysisProduct> ProteolysisProducts
161-
{ get { return _proteolysisProducts; } }
165+
{ get { return _proteolysisProducts; } }
162166

163-
public IEnumerable<DatabaseReference> DatabaseReferences { get; }
167+
public IEnumerable<DatabaseReference> DatabaseReferences { get; }
164168
public string DatabaseFilePath { get; }
165169

166170
/// <summary>
@@ -244,14 +248,11 @@ public IEnumerable<PeptideWithSetModifications> Digest(DigestionParams digestion
244248
variableModifications = variableModifications ?? new List<Modification>();
245249
CleavageSpecificity searchModeType = digestionParams.SearchModeType;
246250

247-
ProteinDigestion digestion = new ProteinDigestion(digestionParams, allKnownFixedModifications, variableModifications);
248-
251+
ProteinDigestion digestion = new(digestionParams, allKnownFixedModifications, variableModifications);
249252
IEnumerable<ProteolyticPeptide> unmodifiedPeptides =
250-
digestionParams.Protease.Name == "top-down biomarker" ?
251-
digestion.Digestion(this) :
252253
searchModeType == CleavageSpecificity.Semi ?
253254
digestion.SpeedySemiSpecificDigestion(this) :
254-
digestion.Digestion(this);
255+
digestion.Digestion(this);
255256

256257
if (digestionParams.KeepNGlycopeptide || digestionParams.KeepOGlycopeptide)
257258
{
@@ -635,7 +636,6 @@ public void AddBiomarkersToProteolysisProducts(int fullProteinOneBasedBegin, int
635636
AddNterminalBiomarkers(lengthOfProteolysis, fullProteinOneBasedBegin, fullProteinOneBasedEnd, minProductBaseSequenceLength, proteolyisisProductName);
636637
}
637638
}
638-
639639
}
640640
else // initiator methionine cleavage is variable we have to deal both with keeping and deleting the M
641641
{
@@ -646,15 +646,13 @@ public void AddBiomarkersToProteolysisProducts(int fullProteinOneBasedBegin, int
646646
{
647647
AddNterminalBiomarkers(lengthOfProteolysis + 1, fullProteinOneBasedBegin, fullProteinOneBasedEnd, minProductBaseSequenceLength, proteolyisisProductName);
648648
}
649-
650649
}
651650
//Digest C-terminus -- not affected by variable N-terminus behavior
652651
if (addCterminalDigestionBiomarkers)
653652
{
654653
AddCterminalBiomarkers(lengthOfProteolysis, fullProteinOneBasedEnd, fullProteinOneBasedBegin, minProductBaseSequenceLength, proteolyisisProductName);
655654
}
656655
}
657-
658656
}
659657
else // sequence does not contain N-terminus
660658
{
@@ -712,7 +710,7 @@ private void AddNterminalBiomarkers(int lengthOfProteolysis, int fullProteinOneB
712710
/// <param name="initiatorMethionineBehavior"> this affects the intact proteoform as well as any original proteolysis products containing the N-terminus</param>
713711
/// <param name="minProductBaseSequenceLength"> the same as the min detectable peptide</param>
714712
/// <param name="lengthOfProteolysis"> the number of amino acids that can be removed from either end.</param>
715-
public void AddBiomarkers(bool addFullProtein, bool addForEachOrigninalProteolysisProduct, bool addNterminalDigestionBiomarkers, bool addCterminalDigestionBiomarkers, InitiatorMethionineBehavior initiatorMethionineBehavior, int minProductBaseSequenceLength, int lengthOfProteolysis)
713+
public void AddBiomarkers(bool addFullProtein = true, bool addForEachOrigninalProteolysisProduct = true, bool addNterminalDigestionBiomarkers = true, bool addCterminalDigestionBiomarkers = true, InitiatorMethionineBehavior initiatorMethionineBehavior = InitiatorMethionineBehavior.Retain, int minProductBaseSequenceLength = 7, int lengthOfProteolysis = 5)
716714
{
717715
if (addFullProtein) //this loop adds the intact protoeoform and its proteolysis products to the proteolysis products list
718716
{
@@ -729,8 +727,8 @@ public void AddBiomarkers(bool addFullProtein, bool addForEachOrigninalProteolys
729727

730728
if (addForEachOrigninalProteolysisProduct) // this does not include the original intact proteoform
731729
{
732-
RemoveMethionineWhenAppropriateFromExistingProduts(initiatorMethionineBehavior);
733-
List<ProteolysisProduct> existingProducts = ProteolysisProducts.Where(p => !p.Type.Contains("biomarker") && !p.Type.Contains("intact")).ToList();
730+
RemoveMethionineWhenAppropriateFromExistingProduts(initiatorMethionineBehavior);
731+
List<ProteolysisProduct> existingProducts = ProteolysisProducts.Where(p => !p.Type.Contains("biomarker") && !p.Type.Contains("intact")).ToList();
734732
foreach (ProteolysisProduct product in existingProducts)
735733
{
736734
if (product.OneBasedBeginPosition.HasValue && product.OneBasedEndPosition.HasValue)
@@ -753,14 +751,15 @@ public void AddBiomarkers(bool addFullProtein, bool addForEachOrigninalProteolys
753751
}
754752
}
755753
}
754+
CleaveOnceBetweenProteolysisProducts();
756755
}
757756

758757
/// <summary>
759758
/// When a protein has existing proteolysis products, we have to remove methionine when appropriate before creating additional proteolysis products
760759
/// </summary>
761760
/// <param name="existingProducts"></param>
762761
/// <param name="initiatorMethionineBehavior"></param>
763-
private void RemoveMethionineWhenAppropriateFromExistingProduts(InitiatorMethionineBehavior initiatorMethionineBehavior)
762+
public void RemoveMethionineWhenAppropriateFromExistingProduts(InitiatorMethionineBehavior initiatorMethionineBehavior)
764763
{
765764
List<ProteolysisProduct> productsAtNterminusWithMethionine = _proteolysisProducts.Where(p => !p.Type.Contains("biomarker") && !p.Type.Contains("intact") && p.OneBasedBeginPosition == 1).ToList();
766765

@@ -784,37 +783,82 @@ private void RemoveMethionineWhenAppropriateFromExistingProduts(InitiatorMethion
784783
{
785784
//here we don't want to do anything, we leave in the products with begin position = 1. Later we'll add an additional proteolysis product so that we get the right number
786785
}
787-
788786
}
789787
}
790788
}
791789
}
792790

793-
private void AddIntactProteoformToProteolysisProducts(InitiatorMethionineBehavior initiatorMethionineBehavior, int minProductBaseSequenceLength)
791+
public void AddIntactProteoformToProteolysisProducts(InitiatorMethionineBehavior initiatorMethionineBehavior, int minProductBaseSequenceLength)
794792
{
795793
if (initiatorMethionineBehavior == InitiatorMethionineBehavior.Retain || initiatorMethionineBehavior == InitiatorMethionineBehavior.Variable)
796794
{
797795
//when it's variable, we don't have to add anything here, we'll get an additional proteolysis product later.
798-
if(BaseSequence.Length >= minProductBaseSequenceLength)
796+
if (BaseSequence.Length >= minProductBaseSequenceLength)
799797
{
800-
_proteolysisProducts.Add(new ProteolysisProduct(1, BaseSequence.Length, "intact proteoform"));
798+
_proteolysisProducts.Add(new ProteolysisProduct(1, BaseSequence.Length, "intact proteoform biomarker"));
801799
}
802-
803800
}
804801
else if (initiatorMethionineBehavior == InitiatorMethionineBehavior.Cleave)
805802
{
806803
if (BaseSequence.Substring(0, 1) == "M")
807804
{
808805
if (BaseSequence.Length - 1 >= minProductBaseSequenceLength)
809806
{
810-
_proteolysisProducts.Add(new ProteolysisProduct(2, BaseSequence.Length, "intact proteoform"));
807+
_proteolysisProducts.Add(new ProteolysisProduct(2, BaseSequence.Length, "intact proteoform biomarker"));
811808
}
812809
}
813810
else
814811
{
815812
if (BaseSequence.Length >= minProductBaseSequenceLength)
816813
{
817-
_proteolysisProducts.Add(new ProteolysisProduct(1, BaseSequence.Length, "intact proteoform"));
814+
_proteolysisProducts.Add(new ProteolysisProduct(1, BaseSequence.Length, "intact proteoform biomarker"));
815+
}
816+
}
817+
}
818+
}
819+
820+
/// <summary>
821+
/// Proteins with multiple proteolysis products are not always fully cleaved; we have observed proteolysis products with missed cleavages.
822+
/// This method allows for one missed cleavage between proteolysis products.
823+
/// </summary>
824+
/// <param name="minimumProductLength"></param>
825+
public void CleaveOnceBetweenProteolysisProducts(int minimumProductLength = 7)
826+
{
827+
List<int> cleavagePostions = new();
828+
List<int> proteolysisProductEndPositions = _proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue).Select(p => p.OneBasedEndPosition.Value).ToList();
829+
830+
if (proteolysisProductEndPositions.Count > 0)
831+
{
832+
foreach (int proteolysisProductEndPosition in proteolysisProductEndPositions)
833+
{
834+
if (_proteolysisProducts.Any(p => p.OneBasedBeginPosition == (proteolysisProductEndPosition + 1)))
835+
{
836+
cleavagePostions.Add(proteolysisProductEndPosition);
837+
}
838+
}
839+
}
840+
841+
foreach (int position in cleavagePostions)
842+
{
843+
if (position - 1 >= minimumProductLength)
844+
{
845+
string leftType = $"N-terminal Portion of Singly Cleaved Protein(1-{position})";
846+
ProteolysisProduct leftProduct = new(1, position, leftType);
847+
//here we're making sure a product with these begin/end positions isn't already present
848+
if (!_proteolysisProducts.Any(p => p.OneBasedBeginPosition == leftProduct.OneBasedBeginPosition && p.OneBasedEndPosition == leftProduct.OneBasedEndPosition))
849+
{
850+
_proteolysisProducts.Add(leftProduct);
851+
}
852+
}
853+
854+
if (BaseSequence.Length - position - 1 >= minimumProductLength)
855+
{
856+
string rightType = $"C-terminal Portion of Singly Cleaved Protein({position + 1}-{BaseSequence.Length})";
857+
ProteolysisProduct rightProduct = new(position + 1, BaseSequence.Length, rightType);
858+
//here we're making sure a product with these begin/end positions isn't already present
859+
if (!_proteolysisProducts.Any(p => p.OneBasedBeginPosition == rightProduct.OneBasedBeginPosition && p.OneBasedEndPosition == rightProduct.OneBasedEndPosition))
860+
{
861+
_proteolysisProducts.Add(rightProduct);
818862
}
819863
}
820864
}

mzLib/Proteomics/Protein/ProteolysisProduct.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ public override bool Equals(object obj)
2424

2525
public override int GetHashCode()
2626
{
27-
return (OneBasedBeginPosition ?? 0).GetHashCode()
28-
^ (OneBasedEndPosition ?? 0).GetHashCode()
27+
return (OneBasedBeginPosition ?? 0).GetHashCode()
28+
^ (OneBasedEndPosition ?? 0).GetHashCode()
2929
^ Type.GetHashCode(); // null handled in constructor
3030
}
3131
}

0 commit comments

Comments
 (0)