diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml index f56397fbd..45c8b606f 100644 --- a/.github/workflows/nf-test.yml +++ b/.github/workflows/nf-test.yml @@ -64,7 +64,7 @@ jobs: runs-on: # use self-hosted runners - runs-on=${{ github.run_id }}-nf-test - runner=4cpu-linux-x64 - - disk=large + - volume=80gb strategy: fail-fast: false matrix: diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml index 8509c7bbe..431d3d445 100644 --- a/.github/workflows/release-announcements.yml +++ b/.github/workflows/release-announcements.yml @@ -17,8 +17,7 @@ jobs: - name: get description id: get_description run: | - echo "description=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .description' >> $GITHUB_OUTPUT - + echo "description=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .description')" >> $GITHUB_OUTPUT - uses: rzr/fediverse-action@master with: access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} @@ -27,9 +26,7 @@ jobs: # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release message: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! - - ${{ steps.get_topics.outputs.description }} - + ${{ steps.get_description.outputs.description }} Please see the changelog: ${{ github.event.release.html_url }} ${{ steps.get_topics.outputs.topics }} #nfcore #openscience #nextflow #bioinformatics diff --git a/.nf-core.yml b/.nf-core.yml index 46aa16f34..4c6b27d7b 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -16,4 +16,4 @@ template: name: mag org: nf-core outdir: . 
- version: 5.1.0 + version: 5.2.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 131566026..e39c9d1ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,38 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v5.1.0 - 2025-10-27 +## v5.2.0 - Puce Pangolin [2025-11-07] + +### `Added` + +- [#842](https://github.com/nf-core/mag/pull/842) - Add support for running multiple binQC tools in one run using dedicated `--run_busco`, `--run_checkm`, and `--run_checkm2` parameters (by @harper357, with contributions from @dialvarezs, @prototaxites and @jfy133) +- [#881](https://github.com/nf-core/mag/pull/881) - Add binner MetaBinner (by @d4straub, inspired by @HeshamAlmessady & @AlphaSquad) + +### `Changed` + +- [#842](https://github.com/nf-core/mag/pull/842) - Change `bin_summary.tsv` format for improved clarity and more comprehensiveness (by @harper357, with contributions from @dialvarezs, @prototaxites and @jfy133) + - Now will include columns from all bin QC tools executed in a given run (i.e., all/any of BUSCO, CheckM and CheckM2) + - Adds tool-specific suffixes to columns (e.g. `_busco`, `_checkm`, `_checkm2`) to distinguish which column comes from which tool + +### `Fixed` + +- [#896](https://github.com/nf-core/mag/pull/896) - Remove obsolete execution command from README (by @dialvarezs) +- [#907](https://github.com/nf-core/mag/pull/907) - Include refined bins from all binners in the `DASTool/bins` output folder (by @AlexHoratio) +- [#911](https://github.com/nf-core/mag/pull/911) - Ensure column order is consistent when generating depth summaries to prevent swapped results on merged depth summary (by @dialvarezs) +- [#912](https://github.com/nf-core/mag/pull/912) - Fix validation of multiple sequencing platforms when using `binning_map_mode = "all"` (reported by @mjfi2sb3, fix by @dialvarezs) +- [#921](https://github.com/nf-core/mag/pull/921) - Fix publishing of BUSCO files (reported by 
@joao1980, fix by @dialvarezs) + +### `Dependencies` + +| Tool | Previous version | New version | +| ---------- | ---------------- | ----------- | +| MetaBinner | | 1.4.4-0 | + +### `Deprecated` + +- [#842](https://github.com/nf-core/mag/pull/842) - Remove `--binqc_tool` (by @harper357, with contributions from @dialvarezs, @prototaxites and @jfy133) + +## v5.1.0 - Platinum Pudu [2025-10-27] ### `Added` @@ -30,7 +61,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 5.0.0 - [2025-09-30] +## v5.0.0 - Green Squirrel [2025-09-30] ### `Added` @@ -102,7 +133,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#855](https://github.com/nf-core/mag/pull/855) - Remove test_adapterremoval, test_ancient_dna, test_bbnorm, test_busco_auto, test_host_rm, test_hybrid_host_rm, test_binrefinement, test_concoct and test_longread profiles (added by @dialvarezs) - [#864](https://github.com/nf-core/mag/pull/864) - Remove `--gtdb_mash` due to dropping of support by GTDBTk itself (by @prototaxites and @jfy133) -## v4.0.0 - [2025-05-22] +## v4.0.0 - Blue Huemul [2025-05-22] ### `Added` @@ -138,7 +169,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#730](https://github.com/nf-core/mag/pull/730) - Remove `--busco_auto_lineage_prok` due to update and simplified usage of BUSCO (added by @jfy133, @dialvarezs) -## 3.4.0 [2025-04-04] +## v3.4.0 - Green Gecko [2025-04-04] ### `Added` @@ -187,7 +218,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 3.3.0 [2024-12-19] +## v3.3.0 - Red Reindeer [2024-12-19] ### `Added` @@ -222,7 +253,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 3.2.1 [2024-10-30] +## v3.2.1 [2024-10-30] ### `Added` @@ -236,7 +267,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 
3.2.0 [2024-10-27] +## v3.2.0 - Salmon Salmon [2024-10-27] ### `Added` @@ -268,7 +299,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 3.1.0 [2024-10-04] +## v3.1.0 - Pink Panda [2024-10-04] ### `Added` @@ -296,7 +327,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#670](https://github.com/nf-core/mag/pull/670) - Deprecated `--gtdbtk_pplacer_scratch` due to unintuitive usage (reported by harper357, fixed by @jfy133) -## 3.0.3 [2024-08-27] +## v3.0.3 [2024-08-27] ### `Added` @@ -315,7 +346,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 3.0.2 [2024-07-04] +## v3.0.2 [2024-07-04] ### `Added` @@ -338,7 +369,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 3.0.1 [2024-06-10] +## v3.0.1 [2024-06-10] ### `Added` @@ -358,7 +389,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 3.0.0 - [2024-05-13] +## v3.0.0 - Magenta Magpie [2024-05-13] ### `Added` @@ -379,7 +410,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#599](https://github.com/nf-core/mag/pull/599) - Direct reads input (`--input 'sample_{R1,R2}.fastq.gz'`) is no longer supported, all input must come via samplesheets (by @jfy133) -## 2.5.4 - [2024-02-12] +## v2.5.4 [2024-02-12] ### `Added` @@ -396,7 +427,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 2.5.3 - [2024-02-05] +## v2.5.3 [2024-02-05] ### `Added` @@ -412,7 +443,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 2.5.2 - [2024-02-02] +## v2.5.2 [2024-02-02] ### `Added` @@ -437,7 +468,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Deprecated` -## 2.5.1 - [2023-11-17] +## v2.5.1 [2023-11-17] 
### `Added` @@ -459,7 +490,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#536](https://github.com/nf-core/mag/pull/536) - Remove custom function with native Nextflow for checking file extension (reported by @d4straub, fix by @jfy133) -## 2.5.0 - [2023-10-10] +## v2.5.0 - Aquamarine Kangaroo [2023-10-10] ### `Added` @@ -484,7 +515,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#504](https://github.com/nf-core/mag/pull/504) - `--busco_reference`, `--busco_download_path`, `--save_busco_reference` parameters have been deprecated and replaced with new parameters (by @gregorysprenger). -## 2.4.0 - 2023-09-26 +## v2.4.0 - Grey Hammerhead [2023-09-26] ### `Added` @@ -548,7 +579,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#458](https://github.com/nf-core/mag/pull/458) - Correct the major issue in ancient DNA workflow of binning refinement being performed on uncorrected contigs instead of aDNA consensus recalled contigs (issue [#449](https://github.com/nf-core/mag/issues/449)) - [#451](https://github.com/nf-core/mag/pull/451) - Fix results file overwriting in Ancient DNA workflow (reported by @alexhbnr, fix by @jfy133, and integrated by @maxibor in [#458](https://github.com/nf-core/mag/pull/458) ) -## v2.3.0 - [2023/03/02] +## v2.3.0 - Red Cow [2023-03-02] ### `Added` @@ -585,7 +616,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | Freebayes | 1.3.5 | 1.3.6 | | SAMtools | 1.15 | 1.16.1 | -## v2.2.1 - 2022/08/25 +## v2.2.1 [2022-08-25] ### `Added` @@ -598,7 +629,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Dependencies` -## v2.2.0 - 2022/06/14 +## v2.2.0 - Golden Mammoth [2022-06-14] ### `Added` @@ -631,7 +662,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | fastp | 0.20.1 | 0.23.2 | | MultiQC | 1.11 | 1.12 | -## v2.1.1 - 
2021/11/25 +## v2.1.1 [2021-11-25] ### `Added` @@ -654,7 +685,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#255](https://github.com/nf-core/mag/pull/255) - Update gtdbtk conda channel. - [#258](https://github.com/nf-core/mag/pull/258) - FastP results are now in MultiQC. -## v2.1.0 - 2021/07/29 +## v2.1.0 - Black Zebra [2021-07-29] ### `Added` @@ -670,7 +701,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#226](https://github.com/nf-core/mag/pull/226) - Fix handling of `BUSCO` output when run in auto lineage selection mode and selected specific lineage is the same as the generic one. -## v2.0.0 - 2021/06/01 +## v2.0.0 - Silver Swan [2021-06-01] ### `Added` @@ -705,7 +736,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#195](https://github.com/nf-core/mag/pull/195) - Fix documentation regarding required compression of input FastQ files [#160](https://github.com/nf-core/mag/issues/160) - [#196](https://github.com/nf-core/mag/pull/196) - Add process for CAT database creation as solution for problem caused by incompatible `DIAMOND` version used for pre-built `CAT database` and `CAT classification` [#90](https://github.com/nf-core/mag/issues/90), [#188](https://github.com/nf-core/mag/issues/188) -## v1.2.0 - 2021/02/10 +## v1.2.0 - Yellow Squirrel [2021-02-10] ### `Added` @@ -725,7 +756,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#143](https://github.com/nf-core/mag/pull/143) - Change parameter: `--manifest` -> `--input` -## v1.1.2 - 2020/11/24 +## v1.1.2 - Blue Panda [2020-11-24] ### `Changed` @@ -735,7 +766,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#133](https://github.com/nf-core/mag/pull/133) - Fixed processing of `--input` parameter [#131](https://github.com/nf-core/mag/issues/131) -## v1.1.1 - 2020/11/10 +## v1.1.1 - Lime Owl [2020-11-10] ### `Added` 
@@ -752,7 +783,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#120](https://github.com/nf-core/mag/pull/120) - Fix link to CAT database in help message - [#124](https://github.com/nf-core/mag/pull/124) - Fix description of `CAT` process in `output.md` -## v1.1.0 - 2020/10/06 +## v1.1.0 - White Elephant [2020-10-06] ### `Added` @@ -797,7 +828,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#29](https://github.com/nf-core/mag/pull/29) - Change depreciated parameters: `--singleEnd` -> `--single_end`, `--igenomesIgnore` -> `--igenomes_ignore` -## v1.0.0 - 2019/12/20 +## v1.0.0 - Purple Corgi [2019-12-20] Initial release of nf-core/mag, created with the [nf-core](http://nf-co.re/) template. diff --git a/CITATIONS.md b/CITATIONS.md index 112caffd6..f73b5c8b7 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -52,6 +52,10 @@ > Alneberg, J., Bjarnason, B. S., de Bruijn, I., Schirmer, M., Quick, J., Ijaz, U. Z., Lahti, L., Loman, N. J., Andersson, A. F., & Quince, C. (2014). Binning metagenomic contigs by coverage and composition. Nature Methods, 11(11), 1144–1146. doi: 10.1038/nmeth.3103 +- [MetaBinner](https://doi.org/10.1186/s13059-022-02832-6) + + > Wang Z, Huang P, You R, Sun F, Zhu S. MetaBinner: a high-performance and stand-alone ensemble binning method to recover individual genomes from complex microbial communities. Genome Biol. 2023 Jan 6;24(1):1. doi: 10.1186/s13059-022-02832-6. PMID: 36609515; PMCID: PMC9817263. + - [DAS Tool](https://doi.org/10.1038/s41564-018-0171-1) > Sieber, C. M. K., et al. 2018. "Recovery of Genomes from Metagenomes via a Dereplication, Aggregation and Scoring Strategy." Nature Microbiology 3 (7): 836-43. 
doi: 10.1038/s41564-018-0171-1 diff --git a/README.md b/README.md index 2d0ff7d4b..750b04e4a 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ The pipeline then: - performs assembly using [MEGAHIT](https://github.com/voutcn/megahit) and [SPAdes](http://cab.spbu.ru/software/spades/), and checks their quality using [Quast](http://quast.sourceforge.net/quast) - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) - predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://www.google.com/search?channel=fs&client=ubuntu-sn&q=MetaEuk) -- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), [CONCOCT](https://github.com/BinPro/CONCOCT), and/or [COMEBin](https://github.com/ziyewang/COMEBin) +- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), [CONCOCT](https://github.com/BinPro/CONCOCT), [COMEBin](https://github.com/ziyewang/COMEBin), and/or [MetaBinner](https://github.com/ziyewang/MetaBinner) - checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), [CheckM](https://ecogenomics.github.io/CheckM/), or [CheckM2](https://github.com/chklovski/CheckM2) and optionally [GUNC](https://grp-bork.embl-community.io/gunc/) - Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) @@ -60,12 +60,6 @@ Furthermore, the pipeline creates 
various reports in the results directory speci > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. -```bash -nextflow run nf-core/mag -profile --input '*_R{1,2}.fastq.gz' --outdir -``` - -or - ```bash nextflow run nf-core/mag -profile --input samplesheet.csv --outdir ``` @@ -107,6 +101,7 @@ Other code contributors include: - [Nikolaos Vergoulidis](https://github.com/IceGreb) - [Greg Fedewa](https://github.com/harper357) - [Vini Salazar](https://github.com/vinisalazar) +- [Alex Caswell](https://github.com/AlexHoratio) Long read processing was inspired by [caspargross/HybridAssembly](https://github.com/caspargross/HybridAssembly) written by Caspar Gross [@caspargross](https://github.com/caspargross) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 41f6a817a..b7ca4dbbb 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,5 +1,5 @@ report_comment: > - This report has been generated by the nf-core/mag analysis pipeline. For information about how to interpret these results, please see the documentation. + This report has been generated by the nf-core/mag analysis pipeline. For information about how to interpret these results, please see the documentation. 
report_section_order: "nf-core-mag-methods-description": order: -1000 diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 2cff1f854..37ecba8a6 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -19,13 +19,22 @@ def parse_args(args=None): metavar="FILE", help="Bin depths summary file.", ) - parser.add_argument("-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file.") - parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") - parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") - parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") parser.add_argument( - "-t", "--binqc_tool", help="Bin QC tool used", choices=["busco", "checkm", "checkm2"] + "-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file." + ) + parser.add_argument( + "-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file." + ) + parser.add_argument( + "-u", "--busco_summary", metavar="FILE", help="BUSCO summary file." ) + parser.add_argument( + "-c", "--checkm_summary", metavar="FILE", help="CheckM summary file." + ) + parser.add_argument( + "-e", "--checkm2_summary", metavar="FILE", help="CheckM2 summary file." 
+ ) + parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") parser.add_argument( "-o", @@ -74,12 +83,15 @@ def parse_cat_table(cat_table): header=None, skiprows=1, ) - # merge all rank columns into a single column + ## merge all rank columns into a single column df["CAT_rank"] = ( - df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip() + df.filter(regex="rank_\d+") + .apply(lambda x: ";".join(x.dropna()), axis=1) + .str.lstrip() ) - # remove rank_* columns + ## remove rank_* columns df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True) + df = df.add_suffix("_catpack") return df @@ -87,41 +99,65 @@ def parse_cat_table(cat_table): def main(args=None): args = parse_args(args) + ## INPUT VALIDATION + if ( - not args.binqc_summary + not args.busco_summary + and not args.checkm_summary + and not args.checkm2_summary and not args.quast_summary and not args.gtdbtk_summary ): sys.exit( "No summary specified! " - "Please specify at least BUSCO, CheckM, CheckM2 or QUAST summary." + "Please specify at least one of BUSCO, CheckM, CheckM2 or QUAST summary." ) - # GTDB-Tk can only be run in combination with BUSCO, CheckM or CheckM2 - if args.gtdbtk_summary and not args.binqc_summary: + ## GTDB-Tk can only be run in combination with BUSCO, CheckM or CheckM2 + if ( + args.gtdbtk_summary + and not args.busco_summary + and not args.checkm_summary + and not args.checkm2_summary + ): sys.exit( "Invalid parameter combination: " - "GTDB-TK summary specified, but no BUSCO, CheckM or CheckM2 summary!" + "GTDB-TK summary specified, but no BUSCO, CheckM or CheckM2 summary provided!" 
) - # handle bin depths + ## BIN DEPTH PROCESSING + + ## handle bin depths, and extract root bin names results = pd.read_csv(args.depths_summary, sep="\t") - results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns] + results.columns = [ + "Depth " + str(col) if col != "bin" else col for col in results.columns + ] bins = results["bin"].sort_values().reset_index(drop=True) - if args.binqc_summary and args.binqc_tool == "busco": - busco_results = pd.read_csv(args.binqc_summary, sep="\t") + ## BUSCO PROCESSING + + if args.busco_summary: + busco_results = pd.read_csv(args.busco_summary, sep="\t") busco_bins = set(busco_results["Input_file"]) if set(bins) != busco_bins and len(busco_bins.intersection(set(bins))) > 0: - warnings.warn("Bins in BUSCO summary do not match bins in bin depths summary") + warnings.warn( + "Bins in BUSCO summary do not match bins in bin depths summary" + ) elif len(busco_bins.intersection(set(bins))) == 0: sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!") + busco_results = busco_results.add_suffix("_busco") results = pd.merge( - results, busco_results, left_on="bin", right_on="Input_file", how="outer" + results, + busco_results, + left_on="bin", + right_on="Input_file_busco", + how="outer", ) # assuming depths for all bins are given - if args.binqc_summary and args.binqc_tool == "checkm": + ## CHECKM PROCESSING + + if args.checkm_summary: use_columns = [ "Bin Id", "Marker lineage", @@ -141,16 +177,23 @@ def main(args=None): "4", "5+", ] - checkm_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") + checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t") checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" if not set(checkm_results["Bin Id"]).issubset(set(bins)): sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") + checkm_results = checkm_results.add_suffix("_checkm") results = pd.merge( - results, 
checkm_results, left_on="bin", right_on="Bin Id", how="outer" + results, + checkm_results, + left_on="bin", + right_on="Bin Id_checkm", + how="outer", ) # assuming depths for all bins are given - results["Bin Id"] = results["Bin Id"].str.removesuffix(".fa") + results["Bin Id_checkm"] = results["Bin Id_checkm"].str.removesuffix(".fa") + + ## CHECKM2 PROCESSING - if args.binqc_summary and args.binqc_tool == "checkm2": + if args.checkm2_summary: use_columns = [ "Name", "Completeness", @@ -160,44 +203,69 @@ def main(args=None): "Translation_Table_Used", "Total_Coding_Sequences", ] - checkm2_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") + checkm2_results = pd.read_csv( + args.checkm2_summary, usecols=use_columns, sep="\t" + ) checkm2_results["Name"] = checkm2_results["Name"] + ".fa" if not set(checkm2_results["Name"]).issubset(set(bins)): sys.exit("Bins in CheckM2 summary do not match bins in bin depths summary!") + checkm2_results = checkm2_results.add_suffix("_checkm2") results = pd.merge( - results, checkm2_results, left_on="bin", right_on="Name", how="outer" + results, + checkm2_results, + left_on="bin", + right_on="Name_checkm2", + how="outer", ) # assuming depths for all bins are given - results["Name"] = results["Name"].str.removesuffix(".fa") + results["Name_checkm2"] = results["Name_checkm2"].str.removesuffix(".fa") + + ## QUAST PROCESSING if args.quast_summary: quast_results = pd.read_csv(args.quast_summary, sep="\t") - if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)): + if not bins.equals( + quast_results["Assembly"].sort_values().reset_index(drop=True) + ): sys.exit("Bins in QUAST summary do not match bins in bin depths summary!") + quast_results = quast_results.add_suffix("_quast") results = pd.merge( - results, quast_results, left_on="bin", right_on="Assembly", how="outer" + results, + quast_results, + left_on="bin", + right_on="Assembly_quast", + how="outer", ) # assuming depths for all bins are given + 
## GTDBTK PROCESSING + if args.gtdbtk_summary: gtdbtk_results = pd.read_csv(args.gtdbtk_summary, sep="\t") if len(set(gtdbtk_results["user_genome"].to_list()).difference(set(bins))) > 0: sys.exit("Bins in GTDB-Tk summary do not match bins in bin depths summary!") + gtdbtk_results = gtdbtk_results.add_suffix("_gtdbtk") results = pd.merge( - results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer" + results, + gtdbtk_results, + left_on="bin", + right_on="user_genome_gtdbtk", + how="outer", ) # assuming depths for all bins are given + ## CAT_PACK PROCESSING + if args.cat_summary: cat_results = parse_cat_table(args.cat_summary) - if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0: + if len(set(cat_results["bin_catpack"].to_list()).difference(set(bins))) > 0: sys.exit("Bins in CAT summary do not match bins in bin depths summary!") results = pd.merge( results, - cat_results[["bin", "CAT_rank"]], + cat_results[["bin_catpack", "CAT_rank_catpack"]], left_on="bin", - right_on="bin", + right_on="bin_catpack", how="outer", ) - results.to_csv(args.out, sep="\t") + results.sort_values("bin").to_csv(args.out, sep="\t", index=False) if __name__ == "__main__": diff --git a/bin/create_metabinner_bins.py b/bin/create_metabinner_bins.py new file mode 100755 index 000000000..120a65187 --- /dev/null +++ b/bin/create_metabinner_bins.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +## Originally written by Hesham Almessady (@HeshamAlmessady) and Adrian Fritz (@AlphaSquad) in https://github.com/hzi-bifo/mag and released under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. 
+ +import sys +import os +from Bio import SeqIO + +def main(): + # Argument parsing + if len(sys.argv) != 6: + print("Usage: python create_metabinner_bins.py BINNING_TSV CONTIGS_FASTA OUTPUT_DIR PREFIX MIN_LENGTH") + sys.exit(1) + + binning = sys.argv[1] + fasta = sys.argv[2] + path = sys.argv[3] + prefix = sys.argv[4] + length = int(sys.argv[5]) + + # Create output directory if it doesn't exist + os.makedirs(path, exist_ok=True) + + # Load binning data into a dictionary + Metabinner_bins = {} + with open(binning, 'r') as b: + for line in b: + contig, bin = line.strip().split('\t') + Metabinner_bins[contig] = bin + + # Process the input fasta file + with open(fasta) as handle: + for record in SeqIO.parse(handle, "fasta"): + if len(record) < length: + f = prefix + ".tooShort.fa" + elif record.id not in Metabinner_bins: + f = prefix + ".unbinned.fa" + else: + f = prefix + "." + Metabinner_bins[record.id] + ".fa" + with open(os.path.join(path, f), 'a') as out: + SeqIO.write(record, out, "fasta") + +if __name__ == "__main__": + main() diff --git a/bin/get_mag_depths.py b/bin/get_mag_depths.py index 43ce35393..db3a75c9b 100755 --- a/bin/get_mag_depths.py +++ b/bin/get_mag_depths.py @@ -1,15 +1,14 @@ #!/usr/bin/env python -## Originally written by Sabrina Krakau and released under the MIT license. +## Originally written by Sabrina Krakau and updated by Diego Alvarez. Released under the MIT license. ## See git repository (https://github.com/nf-core/mag) for full license text. -import sys import argparse -import os.path -import pandas as pd import csv import gzip +import os.path import statistics +import sys from Bio import SeqIO @@ -29,17 +28,14 @@ def parse_args(args=None): "--depths", required=True, metavar="FILE", - help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].", - ) - parser.add_argument( - "-a", "--assembler", required=True, type=str, help="Assembler name." 
- ) - parser.add_argument( - "-i", "--id", required=True, type=str, help="Sample or group id." - ) - parser.add_argument( - "-m", "--binner", required=True, type=str, help="Binning method." + help=( + "Produces (compressed) TSV file containing contig depths for each sample: contigName, contigLen, " + + "totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...]." + ), ) + parser.add_argument("-a", "--assembler", required=True, type=str, help="Assembler name.") + parser.add_argument("-i", "--id", required=True, type=str, help="Sample or group id.") + parser.add_argument("-m", "--binner", required=True, type=str, help="Binning method.") return parser.parse_args(args) @@ -53,57 +49,48 @@ def main(args=None): sample_names = [] dict_contig_depths = {} with gzip.open(args.depths, "rt") as infile: - reader = csv.reader(infile, delimiter="\t") - # process header - header = next(reader) - for sample in range(int((len(header) - 3) / 2)): - col_name = header[3 + 2 * sample] + reader = csv.DictReader(infile, delimiter="\t") + # process header to extract sample names from column names + depth_columns = [] + for col_name in reader.fieldnames[3::2]: # Every 2nd column starting from index 3 # retrieve sample name: "--.bam" sample_name = col_name[len(args.assembler) + 1 + len(args.id) + 1 : -4] sample_names.append(sample_name) + depth_columns.append(col_name) # process contig depths for row in reader: - contig_depths = [] - for sample in range(int((len(row) - 3) / 2)): - contig_depths.append(float(row[3 + 2 * sample])) - dict_contig_depths[str(row[0])] = contig_depths - - # Initialize output files - n_samples = len(sample_names) - with open( - args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w" - ) as outfile: - print("bin", "\t".join(sample_names), sep="\t", file=outfile) - - # for each bin, access contig depths and compute mean bin depth (for all samples) - for file in args.bins: - all_depths = [[] for i in range(n_samples)] - - if 
file.endswith(".gz"): - with gzip.open(file, "rt") as infile: - for rec in SeqIO.parse(infile, "fasta"): - contig_depths = dict_contig_depths[rec.id] - for sample in range(n_samples): - all_depths[sample].append(contig_depths[sample]) - else: - with open(file, "rt") as infile: - for rec in SeqIO.parse(infile, "fasta"): - contig_depths = dict_contig_depths[rec.id] - for sample in range(n_samples): - all_depths[sample].append(contig_depths[sample]) - - binname = os.path.basename(file) - with open( - args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a" - ) as outfile: - print( - binname, - "\t".join( - str(statistics.median(sample_depths)) - for sample_depths in all_depths - ), - sep="\t", - file=outfile, + contig_depths = {} + for sample_name, col_name in zip(sample_names, depth_columns): + contig_depths[sample_name] = float(row[col_name]) + dict_contig_depths[row[reader.fieldnames[0]]] = contig_depths + + sample_names = sorted(sample_names) + + with open(f"{args.assembler}-{args.binner}-{args.id}-binDepths.tsv", "w") as outfile: + writer = csv.writer(outfile, delimiter="\t") + writer.writerow(["bin", *sample_names]) + + # for each bin, access contig depths and compute mean bin depth (for all samples) + for file in args.bins: + all_depths = {sample: [] for sample in sample_names} + + if file.endswith(".gz"): + with gzip.open(file, "rt") as infile: + for rec in SeqIO.parse(infile, "fasta"): + contig_depths = dict_contig_depths[rec.id] + for sample in sample_names: + all_depths[sample].append(contig_depths[sample]) + else: + with open(file, "rt") as infile: + for rec in SeqIO.parse(infile, "fasta"): + contig_depths = dict_contig_depths[rec.id] + for sample in sample_names: + all_depths[sample].append(contig_depths[sample]) + + binname = os.path.basename(file) + + writer.writerow( + [binname, *[statistics.median(all_depths[sample]) for sample in sample_names]], ) diff --git a/bin/get_mag_depths_summary.py b/bin/get_mag_depths_summary.py index 
694333715..bd803922b 100755 --- a/bin/get_mag_depths_summary.py +++ b/bin/get_mag_depths_summary.py @@ -37,7 +37,7 @@ def main(args=None): assembly_results = pd.read_csv(assembly_depths_file, index_col="bin", sep="\t") results = results.append(assembly_results, sort=True, verify_integrity=True) - results.to_csv(args.out, sep="\t") + results.sort_values("bin").to_csv(args.out, sep="\t") if __name__ == "__main__": diff --git a/conf/base.config b/conf/base.config index 62eba756c..077f976ed 100644 --- a/conf/base.config +++ b/conf/base.config @@ -169,6 +169,9 @@ process { withName: COMEBIN_RUNCOMEBIN { errorStrategy = { task.exitStatus in [1, 255] ? 'ignore' : 'retry' } } + withName: METABINNER_METABINNER { + errorStrategy = { task.exitStatus in [1, 255] ? 'ignore' : 'retry' } + } withName: DASTOOL_DASTOOL { errorStrategy = { task.exitStatus in ((130..145) + 104 + 175) ? 'retry' : task.exitStatus == 1 ? 'ignore' : 'finish' } } diff --git a/conf/modules.config b/conf/modules.config index ab0237cc2..935db8178 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -463,12 +463,13 @@ process { } withName: BUSCO_BUSCO { + tag = { "${meta.assembler}-${meta.binner}-${meta.id}" } ext.args = [ params.busco_db ? 
'--offline' : '' ].join(' ').trim() publishDir = [ [ - path: { "${params.outdir}/GenomeBinning/QC/BUSCO/${meta.id}" }, + path: { "${params.outdir}/GenomeBinning/QC/BUSCO/${meta.assembler}-${meta.binner}-${meta.id}" }, mode: params.publish_dir_mode, pattern: "*{.txt,.json,.log,-busco}", ], @@ -512,8 +513,8 @@ process { ] } - withName: CONCAT_BINQC_TSV { - ext.prefix = { "${params.binqc_tool}_summary" } + withName: 'CONCAT_BUSCO_TSV|CONCAT_CHECKM_TSV|CONCAT_CHECKM2_TSV' { + ext.prefix = { "${meta.id}_summary" } publishDir = [ path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, @@ -532,6 +533,7 @@ process { } withName: CHECKM2_PREDICT { + tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } publishDir = [ path: { "${params.outdir}/GenomeBinning/QC/CheckM2" }, @@ -639,6 +641,7 @@ process { } withName: GTDBTK_CLASSIFYWF { + tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } ext.args = [ "--extension fa", "--min_perc_aa ${params.gtdbtk_min_perc_aa}", @@ -822,6 +825,52 @@ process { ext.prefix = { "${meta.assembler}-COMEBin-${meta.id}" } } + withName: METABINNER_KMER { + ext.prefix = { "${meta.assembler}-MetaBinner-${meta.id}" } + } + + withName: METABINNER_TOOSHORT { + ext.prefix = { "${meta.assembler}-MetaBinner-${meta.id}" } + } + + withName: METABINNER_METABINNER { + publishDir = [ + [ + path: { "${params.outdir}/GenomeBinning/MetaBinner/stats" }, + mode: params.publish_dir_mode, + pattern: '*.{log,log.gz,tsv.gz}', + ] + ] + ext.prefix = { "${meta.assembler}-MetaBinner-${meta.id}" } + ext.args = { "-s ${params.bin_metabinner_scale}" } + } + + withName: METABINNER_BINS { + publishDir = [ + [ + path: { "${params.outdir}/GenomeBinning/MetaBinner/" }, + mode: params.publish_dir_mode, + pattern: 'bins/*.fa.gz', + ], + [ + path: { 
"${params.outdir}/GenomeBinning/MetaBinner/discarded" }, + mode: params.publish_dir_mode, + pattern: '*tooShort.fa.gz', + ], + [ + path: { "${params.outdir}/GenomeBinning/MetaBinner/discarded" }, + mode: params.publish_dir_mode, + pattern: '*lowDepth.fa.gz', + ], + [ + path: { "${params.outdir}/GenomeBinning/MetaBinner/unbinned" }, + mode: params.publish_dir_mode, + pattern: '*unbinned.fa.gz', + ], + ] + ext.prefix = { "${meta.assembler}-MetaBinner-${meta.id}" } + } + withName: SEQKIT_STATS { ext.args = "" publishDir = [enabled: false] @@ -873,7 +922,7 @@ process { [ path: { "${params.outdir}/GenomeBinning/DASTool/bins" }, mode: params.publish_dir_mode, - pattern: '*-{MetaBAT2,MaxBin2,CONCOCT}Refined-*.fa', + pattern: '*Refined-*.fa', ], ] } diff --git a/conf/test.config b/conf/test.config index 12e2b9bf7..3da0b5b24 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,11 +29,13 @@ params { // Including (even length filtered) CONOCT bins adds another 5 minutes, so we skip it in the default test (testing in assemblyinput) skip_concoct = true skip_comebin = true + skip_metabinner = true busco_db = params.pipelines_testdata_base_path + 'mag/databases/busco/bacteria_odb10.2024-01-08.tar.gz' busco_db_lineage = 'bacteria_odb10' busco_clean = true // Prokka is the slowest step of the tests, so we speed up by turning off CDS/product searching prokka_fast_mode = true + // Source: https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/auxillary_files/gtdbtk_package/mockup_db/ gtdb_db = params.pipelines_testdata_base_path + 'mag/databases/gtdbtk/gtdbtk_mockup_20250422.tar.gz' cat_db = params.pipelines_testdata_base_path + 'mag/databases/cat/minigut_cat.tar.gz' cat_no_suggestive_asterisks = true diff --git a/conf/test_alternatives.config b/conf/test_alternatives.config index 1ec74a3a4..783c7c9d5 100644 --- a/conf/test_alternatives.config +++ b/conf/test_alternatives.config @@ -27,16 +27,20 @@ params { // Input data input = params.pipelines_testdata_base_path + 
'mag/samplesheets/samplesheet.v4.csv' clip_tool = 'trimmomatic' - binqc_tool = 'checkm2' + run_busco = true + busco_clean = true + run_checkm2 = true bin_domain_classification = true skip_spades = true skip_quast = true skip_prodigal = true skip_prokka = true skip_gtdbtk = true + gtdbtk_skip_aniscreen = true skip_maxbin2 = true skip_concoct = true skip_comebin = true + skip_metabinner = true skip_metaeuk = true megahit_fix_cpu_1 = true } diff --git a/conf/test_assembly_input.config b/conf/test_assembly_input.config index 9398903e9..b6643ea61 100644 --- a/conf/test_assembly_input.config +++ b/conf/test_assembly_input.config @@ -41,8 +41,10 @@ params { skip_prodigal = true skip_prokka = true skip_gtdbtk = true + gtdbtk_skip_aniscreen = true skip_concoct = false skip_comebin = true + skip_metabinner = true refine_bins_dastool = true refine_bins_dastool_threshold = 0.0 diff --git a/conf/test_full.config b/conf/test_full.config index e228c56e9..aecc01751 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -40,7 +40,8 @@ params { // Skip CONCOCT due to timeout issues skip_concoct = true - binqc_tool = "checkm2" + run_checkm2 = true + run_busco = false // Set Prokka compliance mode to allow metaSPAdes bins to be annotated prokka_with_compliance = true @@ -264,4 +265,4 @@ process { maxRetries = 2 } -aws.client.anonymous = true +aws.client.anonymous = false diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config index 108a719f4..4b7e7f8ac 100644 --- a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -37,8 +37,10 @@ params { skip_flye = true skip_metamdbg = true skip_gtdbtk = true + gtdbtk_skip_aniscreen = true skip_concoct = true skip_comebin = true + skip_metabinner = true spadeshybrid_fix_cpus = 2 } diff --git a/conf/test_longreadonly.config b/conf/test_longreadonly.config index 93614bacb..d6b8e3afb 100644 --- a/conf/test_longreadonly.config +++ b/conf/test_longreadonly.config @@ -36,6 +36,8 @@ params { busco_db_lineage = 'bacteria_odb10' 
skip_gtdbtk = true + gtdbtk_skip_aniscreen = true skip_concoct = true skip_comebin = true + skip_metabinner = true } diff --git a/conf/test_longreadonly_alternatives.config b/conf/test_longreadonly_alternatives.config index df6a302fb..5d4672735 100644 --- a/conf/test_longreadonly_alternatives.config +++ b/conf/test_longreadonly_alternatives.config @@ -36,7 +36,9 @@ params { skip_prodigal = true skip_prokka = true skip_gtdbtk = true + gtdbtk_skip_aniscreen = true skip_concoct = true skip_comebin = true + skip_metabinner = true skip_metaeuk = true } diff --git a/conf/test_minimal.config b/conf/test_minimal.config index 7af9972c3..1189c3383 100644 --- a/conf/test_minimal.config +++ b/conf/test_minimal.config @@ -21,8 +21,8 @@ process { } params { - config_profile_name = 'Test nothing profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test nothing profile' + config_profile_description = 'Minimal test dataset to check pipeline function' // Input data input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.v4.csv' @@ -41,10 +41,15 @@ params { skip_maxbin2 = true skip_concoct = true skip_comebin = true + skip_metabinner = true skip_prokka = true skip_binqc = true + run_busco = false + run_checkm = false + run_checkm2 = false skip_gtdbtk = true - skip_ancient_damagecorrection = true gtdbtk_min_completeness = 0.01 + gtdbtk_skip_aniscreen = true skip_metaeuk = true + skip_ancient_damagecorrection = true } diff --git a/conf/test_single_end.config b/conf/test_single_end.config index e7e60d815..553ff12f7 100644 --- a/conf/test_single_end.config +++ b/conf/test_single_end.config @@ -40,14 +40,19 @@ params { bcftools_view_medium_variant_quality = 0 bcftools_view_minimal_allelesupport = 3 skip_comebin = true + skip_metabinner = true min_length_unbinned_contigs = 1000000 max_unbinned_contigs = 2 - binqc_tool = 'checkm' + run_busco = false + run_checkm = true run_virus_identification = true 
genomad_splits = 7 // micro_db not compatible with current genNomad version - genomad_db = null // 'https://zenodo.org/records/11945948/files/genomad_microdb.tar.gz' + genomad_db = null + // 'https://zenodo.org/records/11945948/files/genomad_microdb.tar.gz' gtdb_db = params.pipelines_testdata_base_path + 'mag/databases/gtdbtk/gtdbtk_mockup_20250422.tar.gz' + gtdbtk_min_completeness = 3 + gtdbtk_skip_aniscreen = true cat_db = params.pipelines_testdata_base_path + 'mag/databases/cat/minigut_cat.tar.gz' cat_no_suggestive_asterisks = true } diff --git a/docs/images/mag_metromap_dark.png b/docs/images/mag_metromap_dark.png index 71bb4a120..07dafaae2 100644 Binary files a/docs/images/mag_metromap_dark.png and b/docs/images/mag_metromap_dark.png differ diff --git a/docs/images/mag_metromap_dark.svg b/docs/images/mag_metromap_dark.svg index 6c7a0f43e..cb0eb26fa 100644 --- a/docs/images/mag_metromap_dark.svg +++ b/docs/images/mag_metromap_dark.svg @@ -28,8 +28,8 @@ inkscape:document-units="mm" showgrid="false" inkscape:zoom="0.825266" - inkscape:cx="703.40957" - inkscape:cy="615.5591" + inkscape:cx="885.77501" + inkscape:cy="545.27873" inkscape:current-layer="g9" showguides="true">v5.1v5.2MetaBat2 /MaxBin2 /CONCOCT/COMEBinCOMEBin /MetaBinnerPROKKAMetaEukBUSCO / CheckM / CheckM2BUSCO / CheckM / CheckM2