@@ -123,8 +123,7 @@ def makeChild(subTree, start):
123123
124124 else :
125125 missense = 0
126-
127-
126+
128127 if (
129128 row ["Variant_Type" ] == "SNP"
130129 or row ["Variant_Type" ] == "DNP"
@@ -463,9 +462,8 @@ def find_most_similar_string(target, strings):
463462 first_AA_same_score ,
464463 max_score ,
465464 )
466-
467465 NMD_dict = {}
468-
466+
469467 for index_mut , row_mut in neoantigen_mut_in .iterrows ():
470468 row_MUT_identity = trim_id (row_mut ["Identity" ])
471469 IDsplit = row_MUT_identity .split ("_" )
@@ -534,7 +532,7 @@ def find_most_similar_string(target, strings):
534532 #This block takes care of Missense mutations caused by polymorphisims
535533 matchfound = True
536534 best_pepmatch = WTdict [WTid ]["peptide" ]
537-
535+
538536 else :
539537 # Here we take care of INDELS and everything else
540538
@@ -579,26 +577,21 @@ def find_most_similar_string(target, strings):
579577 + 1
580578 )
581579
582-
580+
583581 chrom , pos = mutation_dict [row_mut ["Identity" ]].split ("_" )[0 :2 ]
584-
585-
586-
582+
587583 if frameshift :
588584 mut_pos = "Frameshifted peptide"
589585 num_windows = len (list (WTdict [no_positon_ID ]["peptides" ].keys ()))
590586 num_windows_li = len (list (WTdict [no_positon_ID ]["peptides" ].values ()))
591- # [pep.split(",") for pep in num_windows_li]
592-
593- # print(WTdict[no_positon_ID]["peptides"])
587+
594588 print ("FRAMESHIFT" )
595-
589+
596590 if no_positon_ID in NMD_dict :
597591 #NMD must only be calculated once per mutation
598592 pass
599- else :
593+ else :
600594 split_mutation_dict_ID = mutation_dict [row_MUT_identity ].split ("_" )
601- print (split_mutation_dict_ID )
602595 if split_mutation_dict_ID [2 ] == "I" :
603596 len_indel = len (split_mutation_dict_ID [3 ])
604597 elif split_mutation_dict_ID [3 ] == "D" :
@@ -607,10 +600,10 @@ def find_most_similar_string(target, strings):
607600 len_indel = 0
608601 transcriptID = chrom_pos_dict [mutation_dict [row_MUT_identity ]]["transcript" ]
609602 NMD_dict [no_positon_ID ] = determine_NMD (chrom , pos ,num_windows ,len_indel ,ensembl ,transcriptID )
610-
603+
611604 else :
612605 NMD_dict [no_positon_ID ] = "False"
613-
606+
614607 if SV :
615608 neo_dict = {
616609 "id" : row_MUT_identity
@@ -993,7 +986,7 @@ def ensembl_load(release, gtf_file, cdna_file):
993986 # ensembl = EnsemblRelease(int(release))
994987 # ensembl.download()
995988 # ensembl.index()
996-
989+
997990 # else:
998991 ensembl = Genome (gtf_path_or_url = gtf_file ,
999992 transcript_fasta_paths_or_urls = cdna_file ,
@@ -1011,18 +1004,17 @@ def get_exons_from_transcriptID(transcriptid, ensembl, complete=True):
10111004 :param complete: only consider complete transcripts
10121005 :return: a list of tuples of exon ranges
10131006 """
1014-
1007+
10151008 transcripts = ensembl .exon_ids_of_transcript_id (transcriptid )
1016-
1009+
10171010 exon_ranges = []
10181011 for exonid in transcripts :
10191012 exon = ensembl .exon_by_id (exonid )
10201013 exon_ranges .append ((exon .start , exon .end ))
10211014
10221015 return exon_ranges
1023-
1024-
1025-
1016+
1017+
10261018def determine_NMD (chrom , pos ,num_windows ,len_indel , ensembl , transcriptID = None ):
10271019 """
10281020 :param chrom: chromosome where alteration takes place
@@ -1034,7 +1026,7 @@ def determine_NMD(chrom, pos,num_windows,len_indel, ensembl, transcriptID=None):
10341026 :param cdna_file: the path of cdna file if release == custom
10351027 :return: NMD value
10361028 """
1037-
1029+
10381030 if transcriptID == None :
10391031 #do these if maf was not annotated
10401032 transcript = get_transcript (chrom , pos , ensembl )
@@ -1044,62 +1036,50 @@ def determine_NMD(chrom, pos,num_windows,len_indel, ensembl, transcriptID=None):
10441036
10451037 NMD = "False"
10461038
1047- pos = int (pos )
1039+ pos = int (pos ) + 1
10481040 for i in range (0 ,len (exon_ranges )):
1049- if pos > exon_ranges [i ][0 ] and pos < exon_ranges [i ][1 ]:
1050-
1051- exon_ranges_dist = [exon_ranges [p ][1 ]- exon_ranges [p ][0 ] for p in range (i ,len (exon_ranges ))]
1052-
1053- mut_to_stop_dist = (num_windows * 3 )+ len_indel + 1
1054-
1041+ if pos >= exon_ranges [i ][0 ] and pos <= exon_ranges [i ][1 ]:
1042+ exon_ranges_dist = [exon_ranges [p ][1 ]- exon_ranges [p ][0 ] for p in range (0 ,len (exon_ranges ))]
1043+ mut_to_stop_dist = (num_windows * 3 )+ len_indel + 1
1044+
10551045 for d in range (i ,len (exon_ranges_dist )):
10561046 if exon_ranges_dist [d ] == exon_ranges_dist [i ]:
10571047 dist = exon_ranges_dist [d ] - (exon_ranges [i ][1 ]- pos )
1058-
10591048 else :
10601049 dist = exon_ranges_dist [d ]
1061-
1062- if exon_ranges_dist [d ] - mut_to_stop_dist >= 0 :
1063-
1064- PTC_exon = exon_ranges [d ]
1050+
1051+ if dist - mut_to_stop_dist >= 0 :
1052+ PTC_exon = exon_ranges [d ]
1053+ PTC_pos = exon_ranges [d ][0 ] + mut_to_stop_dist
1054+ break
1055+ elif len (exon_ranges_dist )- 1 == d and dist - mut_to_stop_dist < 0 :
1056+ PTC_exon = exon_ranges [d ]
10651057 PTC_pos = exon_ranges [d ][0 ] + mut_to_stop_dist
1066- # print(("Found everything!",PTC_exon,PTC_pos,mut_to_stop_dist))
10671058 else :
10681059 mut_to_stop_dist = mut_to_stop_dist - dist
1069- # print((exon_ranges_dist[d],mut_to_stop_dist))
10701060
10711061 if PTC_exon == exon_ranges [- 1 ]:
10721062 # "on the last exon"
10731063 NMD = "Last Exon"
1074- print ("on the last exon" )
1075- else :
1064+ else :
10761065 if exon_ranges [0 ][0 ] - PTC_pos < 150 :
10771066 # less than 150 nt away from the start exon
10781067 NMD = "Start-proximal"
1079- # print((exon_ranges[0][0] , PTC_pos,exon_ranges[0][0] - PTC_pos))
1080- # print("less than 150 nt away from the start exon")
10811068 else :
10821069 if (PTC_exon [1 ] - PTC_exon [0 ]) > 407 :
10831070 # in a long exon with more than 407 nt
10841071 NMD = "Long Exon"
1085- print ("in a long exon with more than 407 nt" )
10861072 else :
10871073 # it is in the last 50 nt of the penultimate exon
10881074 if PTC_exon == exon_ranges [- 2 ] and (exon_ranges [- 2 ][0 ] - PTC_pos ) < 50 :
10891075 NMD = "50nt Rule"
1090- # print(num_windows)
1091- # print(exon_ranges)
1092- # print(("mut EXON:",exon_ranges[i]))
1093- # print(("PTC EXON",PTC_exon))
1094- # print(exon_ranges[i][-2:])
1095- print ("within 50 nt of the penultimate exon" )
10961076 else :
10971077 NMD = "Trigger NMD"
10981078
10991079 return NMD
1100-
1101-
1102-
1080+
1081+
1082+
11031083def parse_args ():
11041084 parser = argparse .ArgumentParser (description = "Process input files and parameters" )
11051085 parser .add_argument ("--maf_file" , required = True , help = "Path to the MAF file" )
@@ -1125,7 +1105,7 @@ def parse_args():
11251105 help = 'GTF file for the reference.' )
11261106 parser .add_argument ('-cf' , '--cdna-file' , dest = 'cdna_file' , metavar = 'CDNA_FILE' , default = None ,
11271107 help = 'cDNA file for the reference.' )
1128-
1108+
11291109 parser .add_argument ("--id" , required = True , help = "ID" )
11301110 parser .add_argument ("--patient_id" , required = True , help = "Patient ID" )
11311111 parser .add_argument ("--cohort" , required = True , help = "Cohort" )
0 commit comments