Skip to content

Commit 816cbc8

Browse files
committed
Version 0.5
This update brings a new output format to stringMLST. NOtably, versions prior would return no output (columns omitted) for alleles which had 0 kmer hits. stringMLST will now return "NA" values for alleles with no hits. This should hopefully make the output more obvious in cases such as #35
1 parent 0f8602a commit 816cbc8

File tree

1 file changed

+44
-24
lines changed

1 file changed

+44
-24
lines changed

stringMLST.py

100644100755
Lines changed: 44 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
except ImportError:
1616
from urllib import urlopen, urlretrieve
1717
import argparse
18-
version = """ stringMLST v0.4.2 (updated : September 13, 2017) """
18+
version = """ stringMLST v0.5 (updated : January 5, 2018) """
1919
"""
2020
2121
stringMLST free for academic users and requires permission before any commercial
@@ -515,6 +515,7 @@ def goodReads(read, k, non_overlapping_window):
515515
s = str(line[n:n+k])
516516
if s in kmerDict[k]:
517517
for probLoc in kmerDict[k][s]:
518+
# print(probLoc)
518519
if probLoc not in alleleCount:
519520
alleleCount[probLoc] = {}
520521
a = kmerDict[k][s][probLoc]
@@ -558,6 +559,8 @@ def getMaxCount(alleleCount, fileName):
558559
maxSupport = {}
559560
secondSupport = {}
560561
finalProfileCount = {}
562+
for locus in alleleNames:
563+
finalProfileCount[locus] = {}
561564
num = ''
562565
for loc in alleleCount:
563566
n = 0
@@ -567,32 +570,42 @@ def getMaxCount(alleleCount, fileName):
567570
m = n
568571
n = alleleCount[loc][num]
569572
if n-m < fuzzy:
570-
alleleCount[loc][num] = str(alleleCount[loc][num])+'*'
571-
max_n[loc] = str(n)+'*'
573+
try:
574+
alleleCount[loc][num]
575+
except:
576+
pass
577+
else:
578+
alleleCount[loc][num] = str(alleleCount[loc][num])+'*'
579+
max_n[loc] = str(n)+'*'
572580
else:
573581
max_n[loc] = n
574582
secondMax[loc] = m
575583
for loc in alleleCount:
576-
maxSupport[loc] = {}
577-
secondSupport[loc] = {}
578-
num_max = []
579-
num_max2 = []
580-
compare = float(re.sub("\*$", "", str(max_n[loc])))
581-
for num in alleleCount[loc]:
582-
if alleleCount[loc][num] == compare:
583-
if "\*" in str(max_n[loc]):
584-
insert = num + '*'
585-
num_max.append(insert)
586-
else:
587-
num_max.append(num)
588-
maxSupport[loc][num] = max_n[loc]
589-
if alleleCount[loc][num] == secondMax[loc]:
590-
num_max2.append(num)
591-
secondSupport[loc][num] = secondMax[loc]
592584
try:
593-
finalProfileCount[loc] = num_max[0]
594-
except LookupError:
595-
finalProfileCount[loc] = '0'
585+
max_n[loc]
586+
except:
587+
pass
588+
else:
589+
maxSupport[loc] = {}
590+
secondSupport[loc] = {}
591+
num_max = []
592+
num_max2 = []
593+
compare = float(re.sub("\*$", "", str(max_n[loc])))
594+
for num in alleleCount[loc]:
595+
if alleleCount[loc][num] == compare:
596+
if "\*" in str(max_n[loc]):
597+
insert = num + '*'
598+
num_max.append(insert)
599+
else:
600+
num_max.append(num)
601+
maxSupport[loc][num] = max_n[loc]
602+
if alleleCount[loc][num] == secondMax[loc]:
603+
num_max2.append(num)
604+
secondSupport[loc][num] = secondMax[loc]
605+
try:
606+
finalProfileCount[loc] = num_max[0]
607+
except LookupError:
608+
finalProfileCount[loc] = '0'
596609
msgs = "Max Support :" + fileName + " : " + str(maxSupport)
597610
logging.debug(msgs)
598611
msgs = "Second Max Support :" + fileName + " : " + str(secondSupport)
@@ -630,7 +643,8 @@ def findST(finalProfile, stProfile):
630643
exit(0)
631644
transformedFinalProfile = {}
632645
for gene, allele in finalProfile.items():
633-
allele = re.sub("\*", "", allele)
646+
if allele:
647+
allele = re.sub("\*", "", allele)
634648
transformedFinalProfile[finalGeneToSTGene[gene]] = allele
635649
# Check to see if the dictionary is empty, if so then means no allele were found at all
636650
if bool(transformedFinalProfile) is False:
@@ -698,10 +712,13 @@ def loadKmerDict(dbFile):
698712
kmerTableDict = {}
699713
with open(dbFile, 'r') as kmerTableFile:
700714
lines = kmerTableFile.readlines()
715+
global alleleNames
716+
alleleNames = set()
701717
for line in lines:
702718
array = line.rstrip().rsplit('\t')
703719
kmerTableDict[array[0]] = {}
704720
kmerTableDict[array[0]][array[1]] = array[2][1:-1].rsplit(',')
721+
alleleNames.add(array[1])
705722
return kmerTableDict
706723
#############################################################
707724
# Function : loadWeightDict
@@ -820,7 +837,10 @@ def printResults(results, output_filename, overwrite, timeDisp):
820837
for l in sorted(results[s]):
821838
if l == 'ST' or l == 't':
822839
continue
823-
sample += '\t'+results[s][l]
840+
if results[s][l]:
841+
sample += '\t'+results[s][l]
842+
else:
843+
sample += '\tNA'
824844
if timeDisp is True:
825845
sample += '\t' + str(results[s]['ST']) + '\t%.2f ' %results[s]['t']
826846
else:

0 commit comments

Comments
 (0)