@@ -1174,7 +1174,7 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA
11741174 foreach ( int location in cleavageMotifLocations )
11751175 {
11761176 char [ ] motifArray = BaseSequence . Substring ( location , cleavingMotif . Length ) . ToCharArray ( ) ;
1177-
1177+
11781178 for ( int i = 0 ; i < cleavingMotif . Length ; i ++ )
11791179 {
11801180 newBase [ location + i ] = motifArray [ i ] ;
@@ -1191,8 +1191,9 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA
11911191 }
11921192 }
11931193
1194- //We've kept amino acids in the digestion motif in the same position in the decoy peptide.
1195- //Now we will fill the remaining open positions in the decoy with the reverse of amino acids from the target.
1194+ // We've kept amino acids in the digestion motif in the same position in the decoy peptide.
1195+ // Now we will fill the remaining open positions in the decoy with the reverse of amino acids from the target.
1196+ // Part to change to scramble
11961197 int fillPosition = 0 ;
11971198 int extractPosition = this . BaseSequence . Length - 1 ;
11981199 while ( fillPosition < this . BaseSequence . Length && extractPosition >= 0 )
@@ -1250,7 +1251,222 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA
12501251 }
12511252
12521253 }
/// <summary>
/// Generates a decoy peptide from this target by scrambling the target's amino acid sequence.
/// Residues that belong to a digestion motif keep their positions, and every modification
/// travels with the residue it was attached to in the target.
/// A scramble is accepted once its identity to the target (see GetPercentIdentity) is at or
/// below <paramref name="maximumHomology"/>. At most 10 scrambles are attempted; if none is
/// accepted, the sequence is left identical to the target, which triggers the mirror fallback.
/// </summary>
/// <param name="revisedAminoAcidOrder">
/// Array, at least as long as the base sequence, that receives for each decoy position the
/// zero-based target position its residue was taken from. It is overwritten on every attempt.
/// </param>
/// <param name="maximumHomology">Highest identity fraction (0 to 1) an accepted scramble may have</param>
/// <returns>
/// The scrambled decoy peptide, or the result of GetPeptideMirror when no scramble with a
/// sequence different from the target was accepted.
/// </returns>
public PeptideWithSetModifications GetScrambledDecoyFromTarget(int[] revisedAminoAcidOrder, double maximumHomology = 0.3)
{
    // A char cannot be nulled, so '0' (which never appears in a peptide string) marks
    // "still open" slots in newBase and "already placed" residues in evaporatingBase.
    const char placeholder = '0';
    const int maxScrambles = 10;
    int seqLength = this.BaseSequence.Length;

    Dictionary<int, Modification> newModificationsDictionary = new Dictionary<int, Modification>();
    // Copy the N-terminal modification (key 1) from the target dictionary to the decoy dictionary.
    if (this.AllModsOneIsNterminus.TryGetValue(1, out Modification nTerminalMod))
    {
        newModificationsDictionary.Add(1, nTerminalMod);
    }

    char[] newBase = new char[seqLength];
    Array.Fill(newBase, placeholder);
    char[] evaporatingBase = this.BaseSequence.ToCharArray();

    List<DigestionMotif> motifs = this.DigestionParams.Protease.DigestionMotifs;
    if (motifs != null && motifs.Count > 0)
    {
        // Motifs with an empty inducing-cleavage string (top-down) pin nothing.
        foreach (var motif in motifs.Where(m => m.InducingCleavage != ""))
        {
            string cleavingMotif = motif.InducingCleavage;
            List<int> cleavageMotifLocations = new List<int>();

            for (int i = 0; i < seqLength; i++)
            {
                (bool fits, bool prevents) = motif.Fits(BaseSequence, i);
                if (fits && !prevents)
                {
                    cleavageMotifLocations.Add(i);
                }
            }

            foreach (int location in cleavageMotifLocations)
            {
                for (int i = 0; i < cleavingMotif.Length; i++)
                {
                    int position = location + i;
                    newBase[position] = BaseSequence[position];
                    revisedAminoAcidOrder[position] = position;

                    // Mods on motif residues are copied directly because those residues do not move.
                    // The indexer (not Add) is used so that two motif matches covering the same
                    // residue cannot throw on a duplicate key; the stored value is the same either way.
                    if (this.AllModsOneIsNterminus.TryGetValue(position + 2, out Modification motifMod))
                    {
                        newModificationsDictionary[position + 2] = motifMod;
                    }

                    evaporatingBase[position] = placeholder;
                }
            }
        }
    }

    // Amino acids in the digestion motif now sit at their original positions in the decoy.
    // The remaining open positions are filled with the other target residues in random order.
    // A fixed seed guarantees that the same target always yields the same decoy.
    Random rand = new(56);
    double percentIdentity = 1;
    int scrambleAttempt = 0;
    double maxIdentity = maximumHomology;

    while (scrambleAttempt < maxScrambles && percentIdentity > maxIdentity)
    {
        // Work on copies so a rejected attempt leaves the motif-only state untouched.
        Dictionary<int, Modification> tempModificationsDictionary = new(newModificationsDictionary);
        char[] tempNewBase = new char[newBase.Length];
        Array.Copy(newBase, tempNewBase, newBase.Length);

        // residueNums holds the indices of evaporatingBase that have not been drawn yet.
        // Each drawn index is removed so that no residue is placed twice.
        var residueNums = Enumerable.Range(0, evaporatingBase.Length).ToList();
        int fillPosition = 0;
        int characterCounter = 0;

        // characterCounter bounds the number of draws to the number of residues. fillPosition only
        // advances when a non-motif residue is placed, so without this second bound the loop could
        // keep drawing from an exhausted residueNums list when motif residues are present.
        while (fillPosition < seqLength && characterCounter < seqLength)
        {
            int residueNumsIndex = rand.Next(residueNums.Count);
            int extractPosition = residueNums[residueNumsIndex];
            char targetAA = evaporatingBase[extractPosition];
            residueNums.RemoveAt(residueNumsIndex);

            if (targetAA != placeholder)
            {
                // Skip slots already occupied by motif residues.
                while (tempNewBase[fillPosition] != placeholder)
                {
                    fillPosition++;
                }

                tempNewBase[fillPosition] = targetAA;
                revisedAminoAcidOrder[fillPosition] = extractPosition;
                if (this.AllModsOneIsNterminus.TryGetValue(extractPosition + 2, out Modification movedMod))
                {
                    tempModificationsDictionary.Add(fillPosition + 2, movedMod);
                }
                fillPosition++;
            }
            characterCounter++;
        }
        scrambleAttempt++;

        /*
         * Any homology scoring mechanism can go here; percent identity is probably not the best.
         * For producing a decoy whose mass spectrum differs from the target's, varying the amino
         * acids at the edges matters far more than varying those in the middle: changes at the
         * edges offset the entire b and y ion series, giving an effective decoy spectrum even
         * when identity in the middle of the sequence is high. Also, for peptides dominated by
         * one amino acid it is very difficult to generate a low-homology sequence.
         */
        percentIdentity = GetPercentIdentity(tempNewBase, evaporatingBase, tempModificationsDictionary, this.AllModsOneIsNterminus);

        // Accept with "<=" so acceptance is the exact complement of the loop's "percentIdentity > maxIdentity"
        // continuation test. With a strict "<", an attempt whose identity equals the cutoff ended the loop
        // without being committed, leaving '0' placeholders in the returned decoy sequence.
        if (percentIdentity <= maxIdentity)
        {
            newBase = tempNewBase;
            newModificationsDictionary = tempModificationsDictionary;
            // Code checking similarity between theoretical spectra could go here.
        }
        // Out of attempts: restore the original residues in the open slots so that the decoy
        // sequence equals the target sequence, which triggers the mirror fallback below.
        else if (scrambleAttempt == maxScrambles)
        {
            for (int j = 0; j < newBase.Length; j++)
            {
                if (newBase[j] == placeholder)
                {
                    newBase[j] = evaporatingBase[j];
                }
            }
        }
    }

    string newBaseString = new string(newBase);

    // Splice the decoy peptide sequence into a copy of the parent protein sequence.
    var proteinSequence = this.Protein.BaseSequence;
    var aStringBuilder = new StringBuilder(proteinSequence);
    aStringBuilder.Remove(this.OneBasedStartResidueInProtein - 1, this.BaseSequence.Length);
    aStringBuilder.Insert(this.OneBasedStartResidueInProtein - 1, newBaseString);
    proteinSequence = aStringBuilder.ToString();

    Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List<Tuple<string, string>>(), new Dictionary<int, List<Modification>>(), null, null, null, true);
    DigestionParams d = this.DigestionParams;
    // Hash code of the target peptide, stored on the decoy to pair the two.
    int targetHash = GetHashCode();
    PeptideWithSetModifications decoyPeptide;

    if (newBaseString != this.BaseSequence)
    {
        // The "peptideDescription" argument stores the corresponding target's full sequence.
        decoyPeptide = new PeptideWithSetModifications(decoyProtein, d, this.OneBasedStartResidueInProtein, this.OneBasedEndResidueInProtein, this.CleavageSpecificityForFdrCategory, this.FullSequence, this.MissedCleavages, newModificationsDictionary, this.NumFixedMods, newBaseString);
    }
    else
    {
        // The scramble procedure failed to create a PeptideWithSetModifications with a different
        // sequence, so the mirror image peptide is returned instead.
        decoyPeptide = this.GetPeptideMirror(revisedAminoAcidOrder);
    }

    // Pair target and decoy: the target stores the decoy's hash code and the decoy stores the target's.
    PairedTargetDecoyHash = decoyPeptide.GetHashCode();
    decoyPeptide.PairedTargetDecoyHash = targetHash;
    return decoyPeptide;
}
1431+
/// <summary>
/// Computes the fraction of positions at which a scrambled sequence is identical to the
/// sequence it was derived from, taking residue modifications into account.
/// </summary>
/// <param name="scrambledSequence">Character array of the scrambled sequence</param>
/// <param name="unscrambledSequence">
/// Character array of the unscrambled sequence; a '0' entry is always counted as identical
/// </param>
/// <param name="scrambledMods">Modifications of the scrambled sequence, keyed by sequence index + 2</param>
/// <param name="unscrambledMods">Modifications of the unscrambled sequence, keyed by sequence index + 2</param>
/// <returns>Number of identical positions divided by the length of the scrambled sequence</returns>
private static double GetPercentIdentity(char[] scrambledSequence, char[] unscrambledSequence, Dictionary<int, Modification> scrambledMods, Dictionary<int, Modification> unscrambledMods)
{
    int length = scrambledSequence.Length;
    int identicalPositions = 0;

    for (int position = 0; position < length; position++)
    {
        char original = unscrambledSequence[position];
        bool isPlaceholder = original == '0';

        // Different residues at a non-placeholder position: not identical.
        if (!isPlaceholder && scrambledSequence[position] != original)
        {
            continue;
        }

        // Placeholder positions count as identical without looking at modifications.
        if (isPlaceholder)
        {
            identicalPositions++;
            continue;
        }

        int modKey = position + 2;

        // Same residue and no modification on the scrambled side: counted as identical.
        // NOTE(review): a modification present only on the unscrambled side is not checked
        // here, so such a position still counts as identical - confirm this is intended.
        if (!scrambledMods.TryGetValue(modKey, out Modification scrambledMod))
        {
            identicalPositions++;
            continue;
        }

        // Same residue with a modification on the scrambled side: identical only when the
        // unscrambled side carries an equal modification at the same position.
        if (unscrambledMods.TryGetValue(modKey, out Modification unscrambledMod) && scrambledMod == unscrambledMod)
        {
            identicalPositions++;
        }
    }

    return (double)identicalPositions / length;
}
1469+
12541470 //Returns a PeptideWithSetModifications mirror image. Used when reverse decoy sequence is same as target sequence
12551471 public PeptideWithSetModifications GetPeptideMirror ( int [ ] revisedOrderNisOne )
12561472 {
0 commit comments