sbx_metaphlan/sbx_metaphlan.rules at master · sunbeam-labs/sbx_metaphlan · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# -*- mode: Snakemake -*-

TARGET_METAPHLAN = expand(str(CLASSIFY_FP/'metaphlan'/'review'/'{sample}_review.txt'), sample = Samples.keys())
TARGET_METAPHLAN_REPORT = [str(CLASSIFY_FP/'metaphlan'/'taxonomic_assignments.tsv')]

# The original Anaconda environment path is stored here so it can be referred
# to from inside the special metaphlan environment used in select rules below.
import os
CONDA_PREFIX = os.getenv("CONDA_PREFIX")

rule all_metaphlan:
    input:
        TARGET_METAPHLAN_REPORT

# An all-samples-in-one summary table, with samples on columns and taxa on
# rows.
rule taxonomic_assignment_report:
    """ generate metaphlan taxonomic assignment table """
    output:
        str(CLASSIFY_FP/'metaphlan'/'taxonomic_assignments.tsv')
    input:
        fps = expand(str(CLASSIFY_FP/'metaphlan'/'review'/'{sample}_review.txt'), sample = Samples.keys())
    run:
        metaphlan_make_report(input.fps, output[0])

# Chunyu's curated version of the metaphlan output file.
rule metaphlan_review_output:
    output:
        meta_ret = str(CLASSIFY_FP/'metaphlan'/'review'/'{sample}_review.txt')
    input:
        meta_raw = str(CLASSIFY_FP/'metaphlan'/'raw'/'{sample}_metagenome.txt')
    run:
        review_output(input.meta_raw, output.meta_ret)

# Process bowtie2 output to create metaphlan's standard output file.
rule metaphlan_classify:
    output:
        meta_raw = str(CLASSIFY_FP/'metaphlan'/'raw'/'{sample}_metagenome.txt'),
    input:
        sambz2 = str(CLASSIFY_FP/'metaphlan'/'raw'/'{sample}.bowtie2.sam.bz2')
    threads:
        Cfg['sbx_metaphlan']['threads']
    conda:
        "sbx_metaphlan_env.yml"
    shell:
        """
            metaphlan2.py  \
            --nproc {threads} \
            --input_type sam \
            {input.sambz2} {output.meta_raw}
        """

# Run bowtie2 against metaphlan's database.  It can handle bzip2 compression
# automatically (above rule) so we'll include that here too.
rule metaphlan_bowtie:
    output:
        sambz2 = str(CLASSIFY_FP/'metaphlan'/'raw'/'{sample}.bowtie2.sam.bz2')
    input:
        pair = expand(str(QC_FP/'decontam'/'{sample}_{rp}.fastq.gz'),
                      sample = "{sample}",
                      rp = Pairs),
        dbdir = ancient(str(CLASSIFY_FP/'metaphlan'/'databases'))
    threads:
        Cfg['sbx_metaphlan']['threads']
    conda:
        "sbx_metaphlan_env.yml"
    shell:
        """
            bowtie2 --sam-no-hd --sam-no-sq --no-unal --very-sensitive \
            -x "{input.dbdir}/mpa_v20_m200" \
            -1 {input.pair[0]} -2 {input.pair[1]} -p {threads} \
            | bzip2 > {output.sambz2}
        """

# Make sure metaphlan has downloaded its database and indexed it for bowtie2,
# storing the files inside the Sunbeam environment in opt/metaphlan_databases,
# and symlink to it in the metaphlan output dir.
rule metaphlan_database:
    output:
        dbdir = ancient(str(CLASSIFY_FP/'metaphlan'/'databases'))
    conda:
        "sbx_metaphlan_env.yml"
    params:
        conda_prefix = CONDA_PREFIX
    shell:
        """
            # Snakemake by default fails the whole shell script if any command
            # within a pipe has a non-zero exit status.  Disabling that here.
            set +o pipefail

            # Link the two possible database locations (later versions of
            # metaphlan2 use "metaphlan_databases") from the metphlan
            # environment back out to the Sunbeam environment.
            # This allows easier access to the database on disk, whether it's
            # supplied automatically by metaphlan2 or manually before running
            # these rules.
            path_db="{params.conda_prefix}/opt/metaphlan_databases"
            mkdir -p "$path_db"
            ln -sfT "$path_db" "$CONDA_PREFIX"/bin/databases
            ln -sfT "$path_db" "$CONDA_PREFIX"/bin/metaphlan_databases

            function lsbt2 {{
            ls -1 "$CONDA_PREFIX"/bin/databases/mpa_v20_m200{{.{{1..4}},.rev.{{1..2}}}}.bt2 2> /dev/null | wc -l
            }}
            bt2=$(lsbt2)
            if [[ "$bt2" != 6 ]]
            then
                metaphlan2.py --install
            fi
            # To track completion of this task, place a symlink as an output
            # file pointing into the database location inside the metaphlan
            # environment.
            bt2=$(lsbt2)
            if [[ "$bt2" == 6 ]]
            then
                ln -sf "$CONDA_PREFIX/bin/databases" "{output.dbdir}"
            fi
        """

#################### Helper functions

def metaphlan_make_report(fps_input, fp_output):
    kos = []
    for fp in fps_input:
        # filter out the files that don't have any results
        if os.path.getsize(fp) > 1:
            # build pandas dataframes for each file of results
            kos.append(parse_results(fp,".txt"))
    # merge the column results
    kos = pandas.concat(kos, axis=1)

    # write them to file. Replace NAs (due to merging) with 0.
    kos.to_csv(fp_output, sep='\t', na_rep=0, index_label="Term")

def parse_results(fp, input_suffix):
    """ Return a DataFrame containing the results of one sample"""
    sample_name = os.path.basename(fp).rsplit(input_suffix)[0]
    return pandas.read_csv(fp, sep='\t', index_col=0, names=[sample_name], skiprows=1)

# Chunyu's function to filter to just species-level classifications, excluding
# strain-level.
def review_output(raw_fp, output_fp):
     with open(raw_fp) as f:
        output_lines = f.read().splitlines(True)

        if len(output_lines) < 2:
            raise ValueError("Metaphlan output has fewer than 2 lines.")
        elif len(output_lines) == 2:
            revised_output_lines = "".join(output_lines)
        else:
            header = output_lines.pop(0)
            revised_output_lines = [header]
            for line in output_lines:
                if (("s__" in line) and not ("t__" in line)) or (("unclassified" in line) and not ("t__" in line)):
                    revised_output_lines.append(line)
            revised_output_lines = "".join(revised_output_lines)

        with open(output_fp, 'w') as f:
            f.write(revised_output_lines)