Skip to content

Commit 05b1810

Browse files
author
Jon Palmer
committed
use bedtools sort for intersect files #522
1 parent 39a0b70 commit 05b1810

File tree

1 file changed

+27
-8
lines changed

1 file changed

+27
-8
lines changed

funannotate/library.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1482,7 +1482,7 @@ def sortBedproper(input, output):
14821482
cols = line.split('\t')
14831483
data.append(cols)
14841484
# we can now sort
1485-
sort_data = sorted(data, key=lambda x: (x[0], int(x[1])))
1485+
sort_data = natsorted(data, key=lambda x: (x[0], int(x[1])))
14861486
# now we can write back out to file
14871487
with open(output, 'w') as outfile:
14881488
for x in sort_data:
@@ -1516,7 +1516,7 @@ def sortGFFproper(input, output):
15161516
order_map[x] = idx
15171517
idx += 1
15181518
# we can now sort
1519-
sort_data = sorted(data, key=lambda x: (x[0], int(x[3]), order_map[x[2]]))
1519+
sort_data = natsorted(data, key=lambda x: (x[0], int(x[3]), order_map[x[2]]))
15201520
# now we can write back out to file
15211521
with open(output, 'w') as outfile:
15221522
for y in comments:
@@ -5710,16 +5710,29 @@ def SortRenameHeaders(input, output):
57105710
def validate_tRNA(input, genes, gaps, output):
57115711
# run bedtools intersect to keep only input features that don't intersect with either genes or gaps
57125712
sortedInput = os.path.abspath(input)+'.sorted.gff3'
5713-
sortGFFproper(input, sortedInput)
5713+
#sortGFFproper(input, sortedInput)
5714+
cmd1 = ['bedtools', 'sort', '-i', input]
5715+
with open(sortedInput, 'w') as outfile:
5716+
subprocess.call(cmd1, stdout=outfile)
57145717
sortedGenes = os.path.abspath(genes)+'.sorted.gff3'
5715-
sortGFFproper(genes, sortedGenes)
5718+
#sortGFFproper(genes, sortedGenes)
5719+
cmd2 = ['bedtools', 'sort', '-i', genes]
5720+
with open(sortedGenes, 'w') as outfile:
5721+
subprocess.call(cmd2, stdout=outfile)
57165722
if gaps:
57175723
sortedGaps = os.path.abspath(gaps)+'.sorted.gff3'
5718-
sortGFFproper(gaps, sortedGaps)
5724+
#sortGFFproper(gaps, sortedGaps)
5725+
cmd3 = ['bedtools', 'sort', '-i', gaps]
5726+
with open(sortedGaps, 'w') as outfile:
5727+
subprocess.call(cmd3, stdout=outfile)
57195728
cmd = ['bedtools', 'intersect', '-sorted', '-v', '-a', sortedInput, '-b', sortedGenes]
57205729
if gaps:
57215730
cmd.append(sortedGaps)
5722-
runSubprocess2(cmd, '.', log, output)
5731+
tmpOut = os.path.abspath(output)+'.tmp'
5732+
runSubprocess2(cmd, '.', log, tmpOut)
5733+
# now sort properly
5734+
sortGFFproper(tmpOut, output)
5735+
os.remove(tmpOut)
57235736

57245737

57255738
# via https://stackoverflow.com/questions/2154249/identify-groups-of-continuous-numbers-in-a-list
@@ -6592,8 +6605,14 @@ def RemoveBadModels(proteins, gff, length, repeats, BlastResults, tmpdir, method
65926605
repeat_temp = os.path.join(tmpdir, 'genome.repeats.to.remove.gff')
65936606
gffSorted = os.path.abspath(gff)+'.sorted.gff'
65946607
bedSorted = os.path.abspath(repeats)+'.sorted.bed'
6595-
sortBedproper(repeats, bedSorted)
6596-
sortGFFproper(gff, gffSorted)
6608+
#sortBedproper(repeats, bedSorted)
6609+
cmd1 = ['bedtools', 'sort', '-i', repeats]
6610+
with open(bedSorted, 'w') as bedout:
6611+
subprocess.call(cmd1, stdout=bedout)
6612+
#sortGFFproper(gff, gffSorted)
6613+
cmd2 = ['bedtools', 'sort', '-i', gff]
6614+
with open(gffSorted, 'w') as gffout:
6615+
subprocess.call(cmd2, stdout=gffout)
65976616
cmd = ['bedtools', 'intersect', '-sorted', '-f', '0.9', '-a', gffSorted, '-b', bedSorted]
65986617
runSubprocess2(cmd, '.', log, repeat_temp)
65996618
# parse the results from bedtools and add to remove list

0 commit comments

Comments
 (0)