Skip to content

Commit 70e0033

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
updates to support checking BAM headers match FASTA headers
1 parent d49a60c commit 70e0033

File tree

5 files changed

+621
-2
lines changed

5 files changed

+621
-2
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ funannotate sort -i genome.cleaned.fa -o genome.final.fa
120120
hisat2-build genome.final.fa genome
121121
122122
#now align reads to reference, using 12 cpus
123-
hisat2 --max_intronlen 3000 -p 12 -x genome -1 forward_R1.fastq -2 reverse_R2.fastq \
123+
hisat2 --max-intronlen 3000 -p 12 -x genome -1 forward_R1.fastq -2 reverse_R2.fastq \
124124
| samtools view -bS - | samtools sort -o RNA_seq.bam -
125125
```
126126
3b) You can also run something like Trinity/PASA/TransDecoder with the RNA-seq reads, for instructions see [Trinity](https://github.com/trinityrnaseq/trinityrnaseq/wiki/Running%20Trinity) and [PASA](http://pasapipeline.github.io).

bin/funannotate-predict.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,11 @@ def __init__(self, prog):
215215
if not header_test:
216216
lib.log.error("Fasta headers on your input have more characters than the max (%i), reformat headers to continue." % args.header_length)
217217
sys.exit(1)
218+
#if BAM file passed, check if headers are same as input
219+
if args.rna_bam:
220+
if not lib.BamHeaderTest(args.input, args.rna_bam):
221+
lib.log.error("Fasta headers in BAM file do not match genome, exiting.")
222+
sys.exit(1)
218223
#just copy the input fasta to the misc folder and move on.
219224
shutil.copyfile(args.input, genome_input)
220225
Genome = os.path.abspath(genome_input)
@@ -224,6 +229,11 @@ def __init__(self, prog):
224229
sys.exit(1)
225230
for x in [args.masked_genome, args.repeatmasker_gff3]:
226231
lib.checkinputs(x)
232+
#if BAM file passed, check if headers are same as input
233+
if args.rna_bam:
234+
if not lib.BamHeaderTest(args.masked_genome, args.rna_bam):
235+
lib.log.error("Fasta headers in BAM file do not match genome, exiting.")
236+
sys.exit(1)
227237
#now copy over masked genome and repeatmasker
228238
shutil.copyfile(args.masked_genome, genome_input)
229239
shutil.copyfile(args.masked_genome, MaskGenome)
@@ -239,7 +249,6 @@ def __init__(self, prog):
239249
Converter2 = os.path.join(EVM, 'EvmUtils', 'misc', 'augustus_GTF_to_EVM_GFF3.pl')
240250
EVM2proteins = os.path.join(EVM, 'EvmUtils', 'gff3_file_to_proteins.pl')
241251

242-
243252
#repeatmasker, run if not passed from command line
244253
if not os.path.isfile(MaskGenome):
245254
if not args.repeatmodeler_lib:

lib/library.py

+22
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,28 @@ def checkFastaHeaders(input, limit):
440440
else:
441441
return True
442442

443+
def BamHeaderTest(genome, mapping):
444+
import pybam
445+
#get list of fasta headers from genome
446+
genome_headers = []
447+
with open(genome, 'rU') as input:
448+
for rec in SeqIO.parse(input, 'fasta'):
449+
if rec.id not in genome_headers:
450+
genome_headers.append(rec.id)
451+
#get list of fasta headers from BAM
452+
bam_headers = []
453+
with open(mapping, 'rU') as bamin:
454+
bam_file = pybam.bgunzip(bamin)
455+
bam_headers = bam_file.chromosomes_from_header
456+
#now compare lists, basically if BAM headers not in genome headers, then output bad names to logfile and return FALSE
457+
genome_headers = set(genome_headers)
458+
diffs = [x for x in bam_headers if x not in genome_headers]
459+
if len(diffs) > 0:
460+
log.debug("ERROR: These BAM headers not found in genome FASTA headers\n%s" % ','.join(diffs))
461+
return False
462+
else:
463+
return True
464+
443465
def gb2allout(input, GFF, Proteins, Transcripts, DNA):
444466
#this will not output any UTRs for gene models, don't think this is a problem right now....
445467
with open(GFF, 'w') as gff:

0 commit comments

Comments
 (0)