diff --git a/bin/create_metabinner_bins.py b/bin/create_metabinner_bins.py
index 120a65187..9396a101d 100755
--- a/bin/create_metabinner_bins.py
+++ b/bin/create_metabinner_bins.py
@@ -1,16 +1,31 @@
 #!/usr/bin/env python
 
-## Originally written by Hesham Almessady (@HeshamAlmessady) and Adrian Fritz (@AlphaSquad) in https://github.com/hzi-bifo/mag and released under the MIT license.
+## Originally written by Hesham Almessady (@HeshamAlmessady) and Adrian Fritz (@AlphaSquad)
+# in https://github.com/hzi-bifo/mag and released under the MIT license.
 ## See git repository (https://github.com/nf-core/mag) for full license text.
 
-import sys
+import gzip
+import io
 import os
+import sys
+
 from Bio import SeqIO
 
+
 def main():
+    """Split a FASTA file into gzip-compressed per-bin FASTA files.
+
+    Command-line arguments: contig-to-bin TSV, contigs FASTA, bin output
+    directory, file-name prefix, minimum contig length. Contigs shorter than
+    the minimum length and contigs missing from the TSV are written to
+    <prefix>.tooShort.fa.gz and <prefix>.unbinned.fa.gz in the parent of the
+    output directory; every other contig goes to <prefix>.<bin>.fa.gz inside it.
+    """
     # Argument parsing
     if len(sys.argv) != 6:
-        print("Usage: python create_metabinner_bins.py ")
+        print(
+            "Usage: python create_metabinner_bins.py <binning_tsv> <fasta> <out_dir> <prefix> <min_length>"
+        )
         sys.exit(1)
 
     binning = sys.argv[1]
@@ -19,27 +34,51 @@ def main():
     prefix = sys.argv[4]
     length = int(sys.argv[5])
 
     # Create output directory if it doesn't exist
     os.makedirs(path, exist_ok=True)
+    # tooShort/unbinned fractions are written beside the bin directory, not inside it
+    root = os.path.dirname(os.path.normpath(path)) or "."
 
     # Load binning data into a dictionary
-    Metabinner_bins = {}
-    with open(binning, 'r') as b:
+    metabinner_bins = {}
+    with open(binning, "r") as b:
         for line in b:
-            contig, bin = line.strip().split('\t')
-            Metabinner_bins[contig] = bin
+            contig, bin_id = line.strip().split("\t")
+            metabinner_bins[contig] = bin_id
+
+    # One gzip stream per output file, opened on first use and kept open until the end
+    handles = {}
+
+    def get_handle(dest_dir, fname):
+        """Return the cached text writer for dest_dir/fname + ".gz", opening it on first use."""
+        key = os.path.join(dest_dir, fname)
+        if key not in handles:
+            raw = open(key + ".gz", "wb")
+            # mtime=0 keeps the current timestamp out of the gzip header
+            gz = gzip.GzipFile(fileobj=raw, mode="wb", mtime=0)
+            handles[key] = (io.TextIOWrapper(gz, encoding="utf-8"), raw)
+        return handles[key][0]
 
     # Process the input fasta file
-    with open(fasta) as handle:
-        for record in SeqIO.parse(handle, "fasta"):
-            if len(record) < length:
-                f = prefix + ".tooShort.fa"
-            elif record.id not in Metabinner_bins:
-                f = prefix + ".unbinned.fa"
-            else:
-                f = prefix + "." + Metabinner_bins[record.id] + ".fa"
-            with open(os.path.join(path, f), 'a') as out:
-                SeqIO.write(record, out, "fasta")
+    try:
+        with open(fasta) as handle:
+            for record in SeqIO.parse(handle, "fasta"):
+                # Strictly shorter than the cutoff, same comparison as before the gzip rewrite
+                if len(record) < length:
+                    out = get_handle(root, prefix + ".tooShort.fa")
+                elif record.id not in metabinner_bins:
+                    out = get_handle(root, prefix + ".unbinned.fa")
+                else:
+                    out = get_handle(path, prefix + "." + metabinner_bins[record.id] + ".fa")
+                SeqIO.write(record, out, "fasta")
+    finally:
+        # Close every stream even if parsing or writing failed part-way
+        for text, raw in handles.values():
+            try:
+                text.close()
+            finally:
+                raw.close()
+
 
 if __name__ == "__main__":
     main()
diff --git a/modules/local/metabinner_bins/main.nf b/modules/local/metabinner_bins/main.nf
index 14eb96355..efa2621a5 100644
--- a/modules/local/metabinner_bins/main.nf
+++ b/modules/local/metabinner_bins/main.nf
@@ -25,17 +25,12 @@ process METABINNER_BINS {
     # unzip membership file
     zcat ${membership} > membership.tsv
 
-    # collect bins & un-binned fractions
     create_metabinner_bins.py \\
         membership.tsv \\
         ${fasta} \\
         ./bins \\
         ${prefix} \\
         ${min_contig_size}
-    find ./bins/ -name "*.fa" -type f | xargs -t -n 1 bgzip -@ ${task.cpus}
-
-    # zip contig fractions
-    find ./bins/ -name "*[tooShort,unbinned].fa.gz" -type f -exec mv {} . \\;
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/subworkflows/local/binning_metabinner/main.nf b/subworkflows/local/binning_metabinner/main.nf
index 2adbdd888..562396f09 100644
--- a/subworkflows/local/binning_metabinner/main.nf
+++ b/subworkflows/local/binning_metabinner/main.nf
@@ -11,22 +11,18 @@ workflow BINNING_METABINNER {
     main:
     ch_versions = channel.empty()
 
+    ch_assembly = ch_input.map { meta, assembly, _depths -> [meta, assembly] }
+
     // produce k-mer composition table
     METABINNER_KMER(
-        ch_input
-            .map { meta, assembly, _depths ->
-                [meta, assembly]
-            },
+        ch_assembly,
         params.min_contig_size
     )
     ch_versions = ch_versions.mix(METABINNER_KMER.out.versions)
 
     // extract contigs over length threshold
     METABINNER_TOOSHORT(
-        ch_input
-            .map { meta, assembly, _depths ->
-                [meta, assembly]
-            },
+        ch_assembly,
         params.min_contig_size
     )
     ch_versions = ch_versions.mix(METABINNER_TOOSHORT.out.versions)
@@ -41,8 +37,7 @@ workflow BINNING_METABINNER {
 
     // extract bin sequences
     METABINNER_BINS(
-        ch_input.map { meta, assembly, _depths -> [meta, assembly] }
-            .join(METABINNER_METABINNER.out.membership),
+        ch_assembly.join(METABINNER_METABINNER.out.membership),
         params.min_contig_size
     )
     ch_versions = ch_versions.mix(METABINNER_BINS.out.versions)