diff --git a/CHANGELOG.md b/CHANGELOG.md index e39c9d1ee..6ae0249a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#873](https://github.com/nf-core/mag/pull/873) - Document usage of `longread_percentidentity` and `shortread_percentidentity` and set the value of `longread_percentidentity` in the `test_full` profile to 85 (by @prototaxites) - [#875](https://github.com/nf-core/mag/pull/875) - Add binner COMEBin (by @d4straub) +- [#931](https://github.com/nf-core/mag/pull/931) - Added ALE (Assembly Likelihood Estimator) for probabilistic assembly quality control (by @PetcuBogdan) + - ALE provides per-contig quality scores for short-read assemblies (SPAdes, MEGAHIT) + - Runs automatically when binning is enabled (default behavior) + - Output: `Assembly/[assembler]/QC/[sample]/ALE/` + - Can be disabled with `--skip_ale` parameter + ### `Changed` - [#878](https://github.com/nf-core/mag/pull/878) - Refine test_full config with optimised resource usage for AWS release megatests (by @jfy133) diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index b7ca4dbbb..2adf3efac 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -21,6 +21,7 @@ run_modules: - bowtie2 - busco - quast + - ale - prokka - porechop - filtlong @@ -55,6 +56,13 @@ top_modules: info: "Mapping statistics of reads mapped against host genome and subsequently removed." path_filters: - "*_host_removed.bowtie2.log" + - "ale": + name: "ALE: Assembly Likelihood Evaluation" + info: "Log-likelihood evaluation of assemblies using mapped reads (ALE module)." + path_filters: + - "*_ALE/*.ale" + - "*_ALE/*.txt" + - "*_ALE/*.log" - "quast": name: "QUAST: assembly" info: "Assembly statistics of raw assemblies." 
diff --git a/conf/modules.config b/conf/modules.config index 935db8178..763aa5a6c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -407,6 +407,17 @@ process { publishDir = [path: { "${params.outdir}/Assembly/${meta.assembler}/QC/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } + withName: 'NFCORE_MAG:MAG:ALE' { + // Use meta.assembler verbatim (no case folding) so ALE results share the Assembly/<assembler>/QC/<id>/ root used by the assembly QC publishDir above + publishDir = [ + path: { "${params.outdir}/Assembly/${meta.assembler}/QC/${meta.id}/ALE" }, + mode: params.publish_dir_mode, + pattern: "*.{ale,txt,log}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.prefix = { "${meta.id}" } + } + withName: 'QUAST_BINS|QUAST_BINS_SUMMARY' { publishDir = [ path: { "${params.outdir}/GenomeBinning/QC" }, diff --git a/docs/output.md b/docs/output.md index 0f9fcc505..255ae527a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -300,6 +300,21 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft +### Assembly Quality Control with ALE + +[ALE (Assembly Likelihood Estimator)](https://github.com/sc932/ALE) is a probabilistic framework that evaluates assembly quality by computing the likelihood of the sequencing reads given an assembly. ALE provides per-contig quality scores and identifies potentially problematic regions in assemblies by analyzing read mapping patterns and insert size distributions. It is particularly useful for comparing assemblies and identifying misassemblies or low-confidence regions. + +ALE is run on short-read assemblies (SPAdes, SPAdes hybrid, and MEGAHIT) when binning or ancient DNA analysis is enabled. +
+<details markdown="1"><summary>Output files</summary> + +- `Assembly/[assembler]/QC/[sample/group]/ALE/` + - `[sample/group]_ALEoutput.txt`: Per-contig ALE scores and quality metrics, including likelihood estimates for each contig + - No separate log file is published: the ALE module only emits the `*_ALEoutput.txt` report, so ALE's console output stays in the task's Nextflow work directory (`.command.log`) + +</details>
+ ## Gene prediction Protein-coding genes are predicted for each assembly. diff --git a/docs/usage.md b/docs/usage.md index ed0845cb2..7d7140b0a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -454,6 +454,20 @@ This can also remove 'nonsense' bins of e.g. a single or a collection of very sh Note that in this context, it is recommended to also set `--min_length_unbinned_contigs` to a suitably high value that corresponds to a reasonable bin size if the `-bin_*_length` parameters are used, so you have useful 'singular' contigs in the unbinned output. +## A note on assembly quality control with ALE + +The pipeline uses [ALE (Assembly Likelihood Estimator)](https://github.com/sc932/ALE) to perform probabilistic quality assessment of short-read assemblies generated by MEGAHIT and SPAdes. + +ALE evaluates assembly quality by computing the likelihood that the assembly could have generated the observed sequencing reads. Unlike traditional assembly QC tools that rely on reference genomes or marker genes, ALE provides a reference-free quality assessment that is particularly useful for novel organisms or complex metagenomes where references may not be available. + +ALE runs automatically when binning is enabled (default behavior) or when `--ancient_dna` is set, provided that short reads are available and assemblies are generated with MEGAHIT, SPAdes or SPAdesHybrid. The tool generates quality assessment files in `Assembly/[assembler]/QC/[sample]/ALE/` containing per-assembly likelihood metrics (`[sample]_ALEoutput.txt`). + +ALE scores are log-likelihoods where higher (less negative) values indicate better assembly quality. These scores reflect how well the assembly explains the observed sequencing reads and can help identify assemblies that may have structural issues or errors that could affect downstream binning and annotation. + +If you wish to skip the ALE quality assessment step (for example, to speed up the pipeline when working with well-characterized samples), you can disable it with `--skip_ale`. 
+ +Note that ALE only works with short-read assemblies (MEGAHIT, SPAdes). Long-read assemblies (Flye, MetaMDBG) are not supported by ALE, and hybrid assemblies (SPAdesHybrid) use only the short-read component for ALE scoring. For more information about ALE and how to interpret the results, see the [ALE GitHub repository](https://github.com/sc932/ALE) and the [publication](https://doi.org/10.1093/bioinformatics/bts723). + ## A note on GTDB having too many files or using too many inodes The GTDB is very large both in size and by the number of files it contains. diff --git a/modules.json b/modules.json index 078b3ba25..0ffb910fb 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"] }, + "ale": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "bbmap/bbnorm": { "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", diff --git a/modules/nf-core/ale/environment.yml b/modules/nf-core/ale/environment.yml new file mode 100644 index 000000000..dc5a46e06 --- /dev/null +++ b/modules/nf-core/ale/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda + - tanghaibao +dependencies: + # renovate: datasource=conda depName=bioconda/ale + - bioconda::ale=20180904 diff --git a/modules/nf-core/ale/main.nf b/modules/nf-core/ale/main.nf new file mode 100644 index 000000000..f70d468a7 --- /dev/null +++ b/modules/nf-core/ale/main.nf @@ -0,0 +1,49 @@ +process ALE { + tag "$meta.id" + label 'process_single' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. 
+ conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ale:20180904--py27ha92aebf_0': + 'biocontainers/ale:20180904--py27ha92aebf_0' }" + + input: + tuple val(meta), path(asm), path(bam) + + output: + tuple val(meta), path("*_ALEoutput.txt"), emit: ale + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '20180904' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + ALE \\ + ${args} \\ + ${bam} \\ + ${asm} \\ + ${prefix}_ALEoutput.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ale: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '20180904' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch ${prefix}_ALEoutput.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ale: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/ale/meta.yml b/modules/nf-core/ale/meta.yml new file mode 100644 index 000000000..12c26cfc9 --- /dev/null +++ b/modules/nf-core/ale/meta.yml @@ -0,0 +1,58 @@ +name: "ale" +description: "ALE: assembly likelihood estimator." +keywords: + - reference-independent + - assembly + - evaluation +tools: + - "ale": + description: "ALE is a generic assembly likelihood evaluation framework for assessing + the accuracy of genome and metagenome assemblies." 
+ documentation: "https://portal.nersc.gov/dna/RD/Adv-Seq/ALE-doc/index.html#document-install" + tool_dev_url: "https://github.com/sc932/ALE" + doi: "10.1093/bioinformatics/bts723" + licence: ["NCSA"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - asm: + type: file + description: Assembly in FASTA format + pattern: "*.{fasta,fa}" + ontologies: + - edam: "http://edamontology.org/format_1929" # FASTA + - bam: + type: file + description: BAM file containing sorted read mappings + pattern: "*.{bam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM +output: + ale: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*_ALEoutput.txt": + type: file + description: Output TXT file containing ALE results + pattern: "*_ALEoutput.{txt}" + ontologies: + - edam: "http://edamontology.org/format_2330" # Textual format + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML +authors: + - "@rodtheo" +maintainers: + - "@rodtheo" diff --git a/modules/nf-core/ale/tests/main.nf.test b/modules/nf-core/ale/tests/main.nf.test new file mode 100644 index 000000000..87c5cfa65 --- /dev/null +++ b/modules/nf-core/ale/tests/main.nf.test @@ -0,0 +1,108 @@ +// nf-core modules test ale +nextflow_process { + + name "Test Process ALE" + script "../main.nf" + process "ALE" + + tag "modules" + tag "modules_nfcore" + tag "ale" + + test("sarscov2 [fasta] - paired-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + + test("sarscov2 [fasta_gz] - paired-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + + test("sarscov2 [fasta] - single-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + + test("sarscov2 [fasta_gz] - single-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert 
path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + +} diff --git a/modules/nf-core/ale/tests/main.nf.test.snap b/modules/nf-core/ale/tests/main.nf.test.snap new file mode 100644 index 000000000..64b3e2c77 --- /dev/null +++ b/modules/nf-core/ale/tests/main.nf.test.snap @@ -0,0 +1,142 @@ +{ + "sarscov2 [fasta_gz] - paired-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:19.589167" + }, + "sarscov2 [fasta] - paired-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:11.683035" + }, + "sarscov2 [fasta_gz] - single-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": 
{ + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:35.914024" + }, + "sarscov2 [fasta] - single-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:27.781196" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 1d5c0ae15..730d23dce 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,6 +69,7 @@ params { skip_spades = false skip_spadeshybrid = false skip_megahit = false + skip_ale = false skip_quast = false skip_prodigal = false skip_metamdbg = false diff --git a/nextflow_schema.json b/nextflow_schema.json index c14cf3743..7bb9d8398 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -629,6 +629,10 @@ "type": "boolean", "description": "Skip MEGAHIT assembly." }, + "skip_ale": { + "type": "boolean", + "description": "Skip ALE" + }, "skip_quast": { "type": "boolean", "description": "Skip metaQUAST." 
diff --git a/workflows/mag.nf b/workflows/mag.nf index 9e75b764c..fc760c852 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -34,6 +34,7 @@ include { PRODIGAL } from '../modules/nf-core/prodigal/ma include { PROKKA } from '../modules/nf-core/prokka/main' include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' include { METAEUK_EASYPREDICT } from '../modules/nf-core/metaeuk/easypredict/main' +include { ALE } from '../modules/nf-core/ale/main' // // MODULE: Local to the pipeline @@ -502,6 +503,50 @@ workflow MAG { } } + /* + ================================================================================ + ALE Analysis + ================================================================================ + */ + + if(!params.skip_ale) { + if ( !params.skip_binning || params.ancient_dna) { + // Keep only assemblies whose meta.assembler is SPAdes, SPAdesHybrid or MEGAHIT (compared case-insensitively) + ch_shortread_assemblies_for_ale = ch_assemblies.filter { meta, assembly -> + meta.assembler?.toUpperCase() in ['SPADES', 'SPADESHYBRID', 'MEGAHIT'] + } + + ch_ale_input = BINNING_PREPARATION.out.grouped_mappings + .join(ch_shortread_assemblies_for_ale, by: 0) + .map { meta, contigs, bam, bai, assembly -> + // NOTE(review): when several BAMs are grouped for one assembly only the first is passed to ALE - confirm this is intended + def actual_bam = bam instanceof List ? bam[0] : bam + [meta, assembly, actual_bam] + } + + ALE(ch_ale_input) + // One versions.yml is enough; first() also avoids mixing a bare empty list into ch_versions when no ALE task runs + ch_versions = ch_versions.mix(ALE.out.versions.first()) + } + else { + log.warn """ + [nf-core/mag] ALE (Assembly Likelihood Estimator) Warnings + + ALE is enabled (--skip_ale false) but cannot run because: + - Binning is disabled (--skip_binning true) + - Ancient DNA mode is not enabled (--ancient_dna false) + + To run ALE, choose one of the following options: + + 1. Enable binning: --skip_binning false + 2. Enable ancient DNA: --ancient_dna true + 3. Disable ALE: --skip_ale true + + ALE evaluates assembly quality through read mapping analysis. + """.stripIndent() + } + } + // // Collate and save software versions //