diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ef985a5 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +# Ignore everything +* + +# Allow requirements file +!/requirements.txt diff --git a/Dockerfile b/Dockerfile index 6d7428f..91418d8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,13 @@ -FROM python:3.11-slim-bullseye +FROM python:3.11.4-slim-bookworm # Install prerequisites RUN apt-get update && \ apt-get upgrade --assume-yes && \ apt-get install --assume-yes --no-install-recommends wget -# ADD bin/install_pplacer.sh /tmp/install_pplacer.sh -# RUN /tmp/install_pplacer.sh - -RUN pip install pandas csvkit fastalite +WORKDIR /usr/local/src/yapp/ +ADD requirements.txt . +RUN pip install --requirement requirements.txt # create some mount points RUN mkdir -p /app /fh /mnt /run/shm diff --git a/SConstruct b/SConstruct index 04cd8c9..6e1bafd 100644 --- a/SConstruct +++ b/SConstruct @@ -104,7 +104,7 @@ to_remove = input.get('to_remove') singularity = conf['singularity'].get('singularity', 'singularity') deenurp_img = conf['singularity']['deenurp'] dada2_img = conf['singularity']['dada2'] -csvkit_img = conf['singularity']['csvkit'] +epa_img = conf['singularity']['epa'] yapp_img = conf['singularity']['yapp'] binds = [os.path.abspath(pth) @@ -130,17 +130,10 @@ vars.Add('venv', None, venv) # paths here to avoid accidental introduction of external # dependencies. -# find the execution path for singularity; assumes 'ml Singularity' has been run -try: - singularity_bin = [pth for pth in os.environ['PATH'].split(':') - if 'singularity' in pth][0] -except IndexError: - sys.exit('PATH for Singularity not found: try running\nml Singularity') - env = SlurmEnvironment( ENV=dict( os.environ, - PATH=':'.join(['bin', path.join(venv, 'bin'), singularity_bin, + PATH=':'.join(['bin', path.join(venv, 'bin'), '/usr/local/bin', '/usr/bin', '/bin']), SLURM_ACCOUNT='fredricks_d', OMP_NUM_THREADS=args.nproc), @@ -153,7 +146,7 @@ env = SlurmEnvironment( binds=' '.join('-B {}'.format(pth) for pth in ['$cwd'] + binds), deenurp_img=('$singularity exec $binds --pwd $cwd {}'.format(deenurp_img)), dada2_img=('$singularity exec $binds --pwd $cwd {}'.format(dada2_img)), - csvkit_img=('$singularity exec $binds --pwd $cwd {}'.format(csvkit_img)), + epa_img=('$singularity exec $binds --pwd $cwd {}'.format(epa_img)), yapp_img=('$singularity exec $binds --pwd $cwd {}'.format(yapp_img)), min_reads=min_reads, ) @@ -173,6 +166,7 @@ for_transfer = [settings] profile = common.taxit_rp(refpkg, 'profile', img=deenurp_img, singularity=singularity) ref_sto = common.taxit_rp(refpkg, 'aln_sto', img=deenurp_img, singularity=singularity) +ref_tree = common.taxit_rp(refpkg, 'tree', img=deenurp_img, singularity=singularity) # filter non-16s reads with cmsearch seqs_16s, seqs_not16s, cmsearch_scores = env.Command( @@ -221,11 +215,28 @@ merged, = env.Command( 'rm ${TARGET}.temp') ) +reference = env.Command( + target='$out/refs_merged.afa', + source=[merged, ref_sto], + action='unmerge.py --stockholm ${SOURCES[1]} --out $TARGET ${SOURCES[0]}') + +query = env.Command( + target='$out/query_merged.afa', + source=[merged, query_sto], + action='unmerge.py --stockholm ${SOURCES[1]} --out $TARGET ${SOURCES[0]}') + dedup_jplace, = env.Command( - target='$out/dedup.jplace', - source=[refpkg, merged], - action=('$deenurp_img pplacer -p --inform-prior --prior-lower 0.01 --map-identity ' - '-c $SOURCES -o $TARGET -j $nproc'), + target='$out/epa_result.jplace', + source=[query, reference, ref_tree], + action='$epa_img epa-ng ' + # '--filter-max 999999999 ' # all placements + '--model GTR+G ' + '--out-dir $out ' + '--query ${SOURCES[0]} ' + '--redo ' + '--ref-msa ${SOURCES[1]} ' + '--threads $nproc ' + '--tree ${SOURCES[2]}' # ncores=nproc, # slurm_queue=large_queue ) @@ -239,10 +250,11 @@ classify_db, = env.Command( action=('rm -f $TARGET && ' '$deenurp_img rppr prep_db -c ${SOURCES[1]} --sqlite $TARGET && ' '$deenurp_img guppy classify ' - '--pp --classifier hybrid2 ' # TODO: specify pplacer settings in config + '--classifier hybrid2 ' # TODO: specify pplacer settings in config '-j $nproc ' '${SOURCES[0]} ' # placefile '-c ${SOURCES[1]} ' + '--nbc-rank species ' '--nbc-sequences ${SOURCES[2]} ' '--sqlite $TARGET ') ) diff --git a/bin/get_classifications.py b/bin/get_classifications.py index 5e34083..5c5c7c1 100755 --- a/bin/get_classifications.py +++ b/bin/get_classifications.py @@ -213,7 +213,7 @@ def main(arguments): try: new_tax_id, __, __ = tax.primary_from_name(tax_name) except ValueError: - log.error(f'could not find {tax_name} in taxonomy') + log.error('could not find {} in taxonomy'.format(tax_name)) missing_tax_names.append(tax_name) else: new_tax_ids.append(new_tax_id) diff --git a/bin/jplace.py b/bin/jplace.py new file mode 100644 index 0000000..1bbd773 --- /dev/null +++ b/bin/jplace.py @@ -0,0 +1,27 @@ +from Bio import Phylo + + +class JParser(Phylo.NewickIO.Parser): + def _parse_tree(self, text): + tree = super()._parse_tree(text) + return Phylo.PhyloXML.Phylogeny( + root=tree.root, + rooted=tree.rooted) + + def new_clade(sef, parent=None): + clade = JClade() + if parent: + clade.parent = parent + return clade + + +class JClade(Phylo.PhyloXML.Clade): + def __init__(self, edge=None, **kwds): + Phylo.PhyloXML.Clade.__init__(self, **kwds) + self.edge = edge + + def __setattr__(self, name, value): + if name == 'name' and value and value.startswith('{'): + self.edge = int(value.strip('{}')) + else: + super().__setattr__(name, value) diff --git a/bin/sv_table.R b/bin/sv_table.R index 3bdc9bb..25bec2f 100644 --- a/bin/sv_table.R +++ b/bin/sv_table.R @@ -4,7 +4,7 @@ suppressPackageStartupMessages(library(argparse, quietly = TRUE)) suppressPackageStartupMessages(library(dplyr, quietly = TRUE)) suppressPackageStartupMessages(library(tidyr, quietly = TRUE)) -options(error=recover) +options(error=NULL) options(width=200) concat <- function(...){ diff --git a/bin/unmerge.py b/bin/unmerge.py new file mode 100755 index 0000000..e7b55da --- /dev/null +++ b/bin/unmerge.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import argparse +import bz2 +import csv +import io +import jplace +import json +import sys + +from Bio import SeqIO + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('merged', type=argparse.FileType('r')) + parser.add_argument('--blast-details') + parser.add_argument('--jplace', type=argparse.FileType('r')) + parser.add_argument('--limit', type=int) + parser.add_argument('--name') + parser.add_argument('--seqs', type=argparse.FileType('r')) + parser.add_argument('--sort-by', default='likelihood') + parser.add_argument('--specimen') + parser.add_argument('--stockholm', type=argparse.FileType('r')) + parser.add_argument( + '--out', default=sys.stdout, type=argparse.FileType('w')) + return parser.parse_args() + + +def main(): + args = get_args() + names = {} # {name: confidence} + if args.blast_details: + details = csv.DictReader(bz2.open(args.blast_details, 'rt')) + details = (d for d in details if d['specimen'] == args.specimen) + details = (d for d in details if d['pident']) + details = (d for d in details if d['assignment_id']) + details = sorted( + details, key=lambda x: float(x['pident']), reverse=True) + for d in details: + names[d['sseqid']] = float('-inf') + if args.name: + names[args.name] = 0 + if args.stockholm: + for a in SeqIO.parse(args.stockholm, 'stockholm'): + names[a.name] = float('-inf') + if args.jplace: + jp = json.load(args.jplace) + tree = next(jplace.JParser(io.StringIO(jp['tree'])).parse()) + pls = {} + if jp['placements']: + pls = jp['placements'][0]['p'] + pls = [{f: p[i] for i, f in enumerate(jp['fields'])} for p in pls] + pls = {p['edge_num']: p for p in pls} + clades = (c for c in tree.find_clades() if c.edge in pls) + for c in clades: + if c.is_terminal(): + names[c.name] = pls[c.edge][args.sort_by] + seqs = SeqIO.parse(args.merged, 'fasta') + seqs = [s for s in seqs if s.name in names] + if not seqs and args.seqs: + seqs = SeqIO.parse(args.seqs, 'fasta') + seqs = [s for s in seqs if s.name in names] + seqs = sorted(seqs, key=lambda s: names[s.name], reverse=True) + if args.limit: + seqs = seqs[:args.limit] + SeqIO.write(seqs, args.out, 'fasta') + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/requirements.txt b/requirements.txt index 3312205..2741e1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ -scons>=3.0.1 +biopython bioscons>=0.9.0 - - +csvkit +fastalite +pandas +scons>=3.0.1