diff --git a/README.md b/README.md index 6b38e55..7dec7cc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # sbx_gene_family -Reads-level based alignment to gene clusters of interest, e.g. bai operon or butyrate producing genes. Please refer to [sunbeam_database](https://github.com/zhaoc1/sunbeam_databases.git) for details. +This extension can be used to perform functional mapping, i.e. mapping metagenomic reads to proteins. The database to map against could be [UniRef50](https://www.uniprot.org/downloads), all prokaryotic proteins from [KEGG](https://www.kegg.jp/kegg/download/), or more targeted [databases](https://github.com/zhaoc1/sunbeam_databases.git), e.g. bai operon or butyrate producing genes. Take [**UniRef50** database](https://www.uniprot.org/downloads) as an example. First download the uniref50.fasta into your current `sunbeam_output/mapping/sbx_gene_family/databases/`. @@ -34,7 +34,7 @@ Take [**UniRef50** database](https://www.uniprot.org/downloads) as an example. F 4. Run time - - Use diamond +By default, mapping uses DIAMOND, but this extension also supports using BLAST. ```bash sunbeam run -- --configfile sunbeam_config.yml all_gene_family diff --git a/config.yml b/config.yml index d3fe4e8..35c3361 100644 --- a/config.yml +++ b/config.yml @@ -1,6 +1,6 @@ sbx_gene_clusters: threads: 8 genes_fp: "{PROJECT_FP}"/mapping/sbx_gene_family/database - evalue: 1e-5 + evalue: 1e-5 # this value is good for read lengths ~126 bp but should be bigger for shorter reads, e.g. 1e-3 for 75bp reads alnLen: 30 - mismatch: 3 \ No newline at end of file + mismatch: 3 diff --git a/sbx_gene_clusters.rules b/sbx_gene_clusters.rules index e3d8161..77abfbf 100644 --- a/sbx_gene_clusters.rules +++ b/sbx_gene_clusters.rules @@ -5,16 +5,17 @@ # Rules for Diamond or BLASTx reads against protein databases import csv +import gzip import os from collections import Counter, OrderedDict -#GENES_DIR = Cfg['sbx_gene_clusters']['genes_fp'] -#GENES_KEY = [PurePath(f.name).stem for f in GENES_DIR.glob('*.fasta')] -#GENES_VAL = [str(GENES_DIR) + '/' + g+'.fasta' for g in GENES_KEY] -#GENES_DICT = dict(zip(GENES_KEY, GENES_VAL)) +GENES_DIR = Cfg['sbx_gene_clusters']['genes_fp'] +GENES_KEY = [PurePath(f.name).stem for f in GENES_DIR.glob('*.fasta')] +GENES_VAL = [str(GENES_DIR) + '/' + g+'.fasta' for g in GENES_KEY] +GENES_DICT = dict(zip(GENES_KEY, GENES_VAL)) -#TARGET_GENES = expand(str(MAPPING_FP/'sbx_gene_family'/'{gene}'/'{sample}_1.txt'), -# gene=GENES_DICT.keys(), sample=Samples.keys()) +TARGET_GENES = expand(str(MAPPING_FP/'sbx_gene_family'/'{gene}'/'{sample}_1.txt'), + gene=GENES_DICT.keys(), sample=Samples.keys()) rule all_gene_family: input: @@ -72,8 +73,15 @@ def write_gene_hits(in_fp, out_fp, db_annot_fp, evalue, alnLen, mismatch): ## Read in the alignment file to count counter_genes = Counter() + + # unzip if necessary + if os.path.splitext(in_fp)[1] == ".gz": + open_fxn = lambda in_fp: gzip.open(in_fp, mode="rt") + else: + open_fxn = lambda in_fp: open(in_fp) + if os.path.getsize(in_fp) > 0: - with open(in_fp) as in_file: + with open_fxn(in_fp) as in_file: reader = csv.reader(in_file, delimiter='\t') # initialize the csv reader aln_chunk = [next(reader)] #initialize the seed chunk @@ -103,6 +111,8 @@ rule gene_hits: evalue = float(Cfg['sbx_gene_clusters']['evalue']), alnLen = Cfg['sbx_gene_clusters']['alnLen'], mismatch = Cfg['sbx_gene_clusters']['mismatch'] + resources: + mem_mb=8192 run: write_gene_hits(input.aln_fp, output[0], input.db_annot_fp[0], params.evalue, params.alnLen, params.mismatch) @@ -155,19 +165,25 @@ rule fq_2_fa: rule diamond_reads: input: - read = str(MAPPING_FP/'R1'/'{sample}_1.fasta'), + # input can be fastq.gz + read = str(QC_FP/'decontam'/'{sample}_1.fastq.gz'), db = expand(str(GENES_DIR/'{{gene}}.fasta.{index}'), index=['dmnd']) output: - str(MAPPING_FP/'sbx_gene_family'/'{gene}'/'{sample}_1.m8') + str(MAPPING_FP/'sbx_gene_family'/'{gene}'/'{sample}_1.m8.gz') threads: Cfg['sbx_gene_clusters']['threads'] + resources: + mem_mb=16384 + params: + evalue=Cfg['sbx_gene_clusters']['evalue'] shell: """ diamond blastx \ --db {input.db} --query {input.read} \ - --threads {threads} --evalue 1e-6 \ + --threads {threads} --evalue {params.evalue} \ --max-target-seqs 0 \ --out {output} \ + --compress 1 \ --outfmt 6 qseqid sseqid pident qlen slen length mismatch gapopen qstart qend sstart send evalue bitscore """