Skip to content

pplacer -> Epa-ng #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Ignore everything
*

# Allow requirements file
!/requirements.txt
9 changes: 4 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
FROM python:3.11-slim-bullseye
FROM python:3.11.4-slim-bookworm

# Install prerequisites
RUN apt-get update && \
apt-get upgrade --assume-yes && \
apt-get install --assume-yes --no-install-recommends wget

# ADD bin/install_pplacer.sh /tmp/install_pplacer.sh
# RUN /tmp/install_pplacer.sh

RUN pip install pandas csvkit fastalite
WORKDIR /usr/local/src/yapp/
ADD requirements.txt .
RUN pip install --requirement requirements.txt

# create some mount points
RUN mkdir -p /app /fh /mnt /run/shm
Expand Down
42 changes: 27 additions & 15 deletions SConstruct
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ to_remove = input.get('to_remove')
singularity = conf['singularity'].get('singularity', 'singularity')
deenurp_img = conf['singularity']['deenurp']
dada2_img = conf['singularity']['dada2']
csvkit_img = conf['singularity']['csvkit']
epa_img = conf['singularity']['epa']
yapp_img = conf['singularity']['yapp']

binds = [os.path.abspath(pth)
Expand All @@ -130,17 +130,10 @@ vars.Add('venv', None, venv)
# paths here to avoid accidental introduction of external
# dependencies.

# find the execution path for singularity; assumes 'ml Singularity' has been run
try:
singularity_bin = [pth for pth in os.environ['PATH'].split(':')
if 'singularity' in pth][0]
except IndexError:
sys.exit('PATH for Singularity not found: try running\nml Singularity')

env = SlurmEnvironment(
ENV=dict(
os.environ,
PATH=':'.join(['bin', path.join(venv, 'bin'), singularity_bin,
PATH=':'.join(['bin', path.join(venv, 'bin'),
'/usr/local/bin', '/usr/bin', '/bin']),
SLURM_ACCOUNT='fredricks_d',
OMP_NUM_THREADS=args.nproc),
Expand All @@ -153,7 +146,7 @@ env = SlurmEnvironment(
binds=' '.join('-B {}'.format(pth) for pth in ['$cwd'] + binds),
deenurp_img=('$singularity exec $binds --pwd $cwd {}'.format(deenurp_img)),
dada2_img=('$singularity exec $binds --pwd $cwd {}'.format(dada2_img)),
csvkit_img=('$singularity exec $binds --pwd $cwd {}'.format(csvkit_img)),
epa_img=('$singularity exec $binds --pwd $cwd {}'.format(epa_img)),
yapp_img=('$singularity exec $binds --pwd $cwd {}'.format(yapp_img)),
min_reads=min_reads,
)
Expand All @@ -173,6 +166,7 @@ for_transfer = [settings]

profile = common.taxit_rp(refpkg, 'profile', img=deenurp_img, singularity=singularity)
ref_sto = common.taxit_rp(refpkg, 'aln_sto', img=deenurp_img, singularity=singularity)
ref_tree = common.taxit_rp(refpkg, 'tree', img=deenurp_img, singularity=singularity)

# filter non-16s reads with cmsearch
seqs_16s, seqs_not16s, cmsearch_scores = env.Command(
Expand Down Expand Up @@ -221,11 +215,28 @@ merged, = env.Command(
'rm ${TARGET}.temp')
)

reference = env.Command(
target='$out/refs_merged.afa',
source=[merged, ref_sto],
action='unmerge.py --stockholm ${SOURCES[1]} --out $TARGET ${SOURCES[0]}')

query = env.Command(
target='$out/query_merged.afa',
source=[merged, query_sto],
action='unmerge.py --stockholm ${SOURCES[1]} --out $TARGET ${SOURCES[0]}')

dedup_jplace, = env.Command(
target='$out/dedup.jplace',
source=[refpkg, merged],
action=('$deenurp_img pplacer -p --inform-prior --prior-lower 0.01 --map-identity '
'-c $SOURCES -o $TARGET -j $nproc'),
target='$out/epa_result.jplace',
source=[query, reference, ref_tree],
action='$epa_img epa-ng '
# '--filter-max 999999999 ' # all placements
'--model GTR+G '
'--out-dir $out '
'--query ${SOURCES[0]} '
'--redo '
'--ref-msa ${SOURCES[1]} '
'--threads $nproc '
'--tree ${SOURCES[2]}'
# ncores=nproc,
# slurm_queue=large_queue
)
Expand All @@ -239,10 +250,11 @@ classify_db, = env.Command(
action=('rm -f $TARGET && '
'$deenurp_img rppr prep_db -c ${SOURCES[1]} --sqlite $TARGET && '
'$deenurp_img guppy classify '
'--pp --classifier hybrid2 ' # TODO: specify pplacer settings in config
'--classifier hybrid2 ' # TODO: specify pplacer settings in config
'-j $nproc '
'${SOURCES[0]} ' # placefile
'-c ${SOURCES[1]} '
'--nbc-rank species '
'--nbc-sequences ${SOURCES[2]} '
'--sqlite $TARGET ')
)
Expand Down
2 changes: 1 addition & 1 deletion bin/get_classifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def main(arguments):
try:
new_tax_id, __, __ = tax.primary_from_name(tax_name)
except ValueError:
log.error(f'could not find {tax_name} in taxonomy')
log.error('could not find {} in taxonomy'.format(tax_name))
missing_tax_names.append(tax_name)
else:
new_tax_ids.append(new_tax_id)
Expand Down
27 changes: 27 additions & 0 deletions bin/jplace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from Bio import Phylo


class JParser(Phylo.NewickIO.Parser):
    """Newick parser for jplace trees.

    jplace files embed a Newick tree whose node labels carry ``{N}``
    edge-number markers.  Clades created by this parser are ``JClade``
    instances, which capture those markers on an ``edge`` attribute
    instead of polluting ``name``.
    """

    def _parse_tree(self, text):
        # Re-wrap the parsed Newick tree as a PhyloXML Phylogeny so the
        # resulting clades follow the richer PhyloXML object model.
        tree = super()._parse_tree(text)
        return Phylo.PhyloXML.Phylogeny(
            root=tree.root,
            rooted=tree.rooted)

    def new_clade(self, parent=None):
        # Fix: first parameter was misspelled 'sef'. Positional calls
        # (self bound implicitly) still worked, but 'self' is correct.
        # Presumably overrides Phylo.NewickIO.Parser.new_clade so every
        # clade built during parsing is a JClade — TODO confirm against
        # the installed Biopython version.
        clade = JClade()
        if parent:
            clade.parent = parent
        return clade


class JClade(Phylo.PhyloXML.Clade):
    """PhyloXML clade that records a jplace edge number.

    jplace trees encode edge numbers as ``{N}`` labels; assigning such a
    label to ``name`` stores the integer N on ``edge`` rather than
    keeping the brace-wrapped string as the clade's name.
    """

    def __init__(self, edge=None, **kwds):
        super().__init__(**kwds)
        self.edge = edge

    def __setattr__(self, name, value):
        # A truthy 'name' of the form '{N}' is an edge marker, not a
        # real label: capture it as an int on 'edge' instead.
        looks_like_edge_label = (
            name == 'name' and value and value.startswith('{'))
        if looks_like_edge_label:
            super().__setattr__('edge', int(value.strip('{}')))
        else:
            super().__setattr__(name, value)
2 changes: 1 addition & 1 deletion bin/sv_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ suppressPackageStartupMessages(library(argparse, quietly = TRUE))
suppressPackageStartupMessages(library(dplyr, quietly = TRUE))
suppressPackageStartupMessages(library(tidyr, quietly = TRUE))

options(error=recover)
options(error=NULL)
options(width=200)

concat <- function(...){
Expand Down
70 changes: 70 additions & 0 deletions bin/unmerge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3
import argparse
import bz2
import csv
import io
import jplace
import json
import sys

from Bio import SeqIO


def get_args():
    """Build the command-line interface and return the parsed arguments.

    Positional 'merged' is the merged alignment fasta; the optional
    flags select which inputs supply sequence names and scores, and
    '--out' defaults to stdout.
    """
    # (flag, add_argument kwargs) in the exact order they should appear.
    specs = [
        ('merged', dict(type=argparse.FileType('r'))),
        ('--blast-details', {}),
        ('--jplace', dict(type=argparse.FileType('r'))),
        ('--limit', dict(type=int)),
        ('--name', {}),
        ('--seqs', dict(type=argparse.FileType('r'))),
        ('--sort-by', dict(default='likelihood')),
        ('--specimen', {}),
        ('--stockholm', dict(type=argparse.FileType('r'))),
        ('--out', dict(default=sys.stdout, type=argparse.FileType('w'))),
    ]
    parser = argparse.ArgumentParser()
    for flag, kwargs in specs:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()


def main():
    """Select and reorder sequences from a merged alignment fasta.

    Builds a mapping of sequence name -> score from whichever optional
    inputs were given (blast details, a literal --name, a stockholm
    alignment, or a jplace placement file), then writes the matching
    records from ``merged`` (falling back to ``--seqs`` if none match)
    to ``--out`` in descending score order.
    """
    args = get_args()
    names = {}  # {name: confidence}
    if args.blast_details:
        # Blast details are a bz2-compressed csv; keep only rows for the
        # requested specimen that have both a pident and an assignment.
        details = csv.DictReader(bz2.open(args.blast_details, 'rt'))
        details = (d for d in details if d['specimen'] == args.specimen)
        details = (d for d in details if d['pident'])
        details = (d for d in details if d['assignment_id'])
        details = sorted(
            details, key=lambda x: float(x['pident']), reverse=True)
        # NOTE(review): every blast hit is scored -inf, so the pident
        # ordering above only survives the later sort via stability of
        # the fasta file order, not via the dict — confirm intended.
        for d in details:
            names[d['sseqid']] = float('-inf')
    if args.name:
        # An explicitly requested name outranks the -inf entries.
        names[args.name] = 0
    if args.stockholm:
        # Any sequence present in the stockholm alignment is eligible.
        for a in SeqIO.parse(args.stockholm, 'stockholm'):
            names[a.name] = float('-inf')
    if args.jplace:
        jp = json.load(args.jplace)
        # The jplace tree labels edges with {N}; JParser exposes each
        # marker as a clade 'edge' attribute.
        tree = next(jplace.JParser(io.StringIO(jp['tree'])).parse())
        pls = {}
        if jp['placements']:
            # Only the first placement's pquery list is considered.
            pls = jp['placements'][0]['p']
        # Re-key each placement row by field name, then by edge number.
        pls = [{f: p[i] for i, f in enumerate(jp['fields'])} for p in pls]
        pls = {p['edge_num']: p for p in pls}
        clades = (c for c in tree.find_clades() if c.edge in pls)
        for c in clades:
            if c.is_terminal():
                # Score leaves by the chosen placement field
                # (default: likelihood).
                names[c.name] = pls[c.edge][args.sort_by]
    seqs = SeqIO.parse(args.merged, 'fasta')
    seqs = [s for s in seqs if s.name in names]
    if not seqs and args.seqs:
        # Fall back to the unmerged sequence file when the merged fasta
        # contains none of the selected names.
        seqs = SeqIO.parse(args.seqs, 'fasta')
        seqs = [s for s in seqs if s.name in names]
    seqs = sorted(seqs, key=lambda s: names[s.name], reverse=True)
    # NOTE(review): '--limit 0' is falsy and therefore means "no limit".
    if args.limit:
        seqs = seqs[:args.limit]
    SeqIO.write(seqs, args.out, 'fasta')


if __name__ == '__main__':
    # main() returns None, so sys.exit reports success (status 0).
    sys.exit(main())
8 changes: 5 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
scons>=3.0.1
biopython
bioscons>=0.9.0


csvkit
fastalite
pandas
scons>=3.0.1