From 0eeb3dfbd0a6d99aa030d45016106e56cd44ca9d Mon Sep 17 00:00:00 2001 From: PetcuBogdan Date: Thu, 23 Oct 2025 10:13:25 +0300 Subject: [PATCH 1/7] Add ale module and call ale from mag.nf --- assets/multiqc_config.yml | 8 ++ docs/output.md | 4 + modules.json | 5 + modules/nf-core/ale/environment.yml | 9 ++ modules/nf-core/ale/main.nf | 49 +++++++ modules/nf-core/ale/meta.yml | 58 ++++++++ modules/nf-core/ale/tests/main.nf.test | 108 +++++++++++++++ modules/nf-core/ale/tests/main.nf.test.snap | 142 ++++++++++++++++++++ nextflow.config | 1 + nextflow_schema.json | 4 + workflows/mag.nf | 17 +++ 11 files changed, 405 insertions(+) create mode 100644 modules/nf-core/ale/environment.yml create mode 100644 modules/nf-core/ale/main.nf create mode 100644 modules/nf-core/ale/meta.yml create mode 100644 modules/nf-core/ale/tests/main.nf.test create mode 100644 modules/nf-core/ale/tests/main.nf.test.snap diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index bf86e4688..8be2e0c20 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -24,6 +24,7 @@ run_modules: - bowtie2 - busco - quast + - ale - prokka - porechop - filtlong @@ -59,6 +60,13 @@ top_modules: removed." path_filters: - "*_host_removed.bowtie2.log" + - "ale": + name: "ALE: Assembly Likelihood Evaluation" + info: "Log-likelihood evaluation of assemblies using mapped reads (ALE module)." + path_filters: + - "*_ALE/*.ale" + - "*_ALE/*.txt" + - "*_ALE/*.log" - "quast": name: "QUAST: assembly" info: "Assembly statistics of raw assemblies." diff --git a/docs/output.md b/docs/output.md index 32b46256c..556f56683 100644 --- a/docs/output.md +++ b/docs/output.md @@ -205,6 +205,10 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl - `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. + - `ALE/[sample/group]/`: Directory containing Assembly Likelihood Estimator (ALE) results + - `[sample].ale.txt`: ALE score file containing per-contig likelihood estimates + - `[sample].summary.tsv`: Summary statistics of ALE scores and assembly quality + - `[sample].log`: Log file produced by ALE during processing diff --git a/modules.json b/modules.json index 87f0f1b8c..22e9bbed3 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"] }, + "ale": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "bbmap/bbnorm": { "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", diff --git a/modules/nf-core/ale/environment.yml b/modules/nf-core/ale/environment.yml new file mode 100644 index 000000000..dc5a46e06 --- /dev/null +++ b/modules/nf-core/ale/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda + - tanghaibao +dependencies: + # renovate: datasource=conda depName=bioconda/ale + - bioconda::ale=20180904 diff --git a/modules/nf-core/ale/main.nf b/modules/nf-core/ale/main.nf new file mode 100644 index 000000000..f70d468a7 --- /dev/null +++ b/modules/nf-core/ale/main.nf @@ -0,0 +1,49 @@ +process ALE { + tag "$meta.id" + label 'process_single' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ale:20180904--py27ha92aebf_0': + 'biocontainers/ale:20180904--py27ha92aebf_0' }" + + input: + tuple val(meta), path(asm), path(bam) + + output: + tuple val(meta), path("*_ALEoutput.txt"), emit: ale + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '20180904' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + ALE \\ + ${args} \\ + ${bam} \\ + ${asm} \\ + ${prefix}_ALEoutput.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ale: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '20180904' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch ${prefix}_ALEoutput.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ale: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/ale/meta.yml b/modules/nf-core/ale/meta.yml new file mode 100644 index 000000000..12c26cfc9 --- /dev/null +++ b/modules/nf-core/ale/meta.yml @@ -0,0 +1,58 @@ +name: "ale" +description: "ALE: assembly likelihood estimator." +keywords: + - reference-independent + - assembly + - evaluation +tools: + - "ale": + description: "ALE is a generic assembly likelihood evaluation framework for assessing + the accuracy of genome and metagenome assemblies." + documentation: "https://portal.nersc.gov/dna/RD/Adv-Seq/ALE-doc/index.html#document-install" + tool_dev_url: "https://github.com/sc932/ALE" + doi: "10.1093/bioinformatics/bts723" + licence: ["NCSA"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - asm: + type: file + description: Assembly in FASTA format + pattern: "*.{fasta,fa}" + ontologies: + - edam: "http://edamontology.org/format_1929" # FASTA + - bam: + type: file + description: BAM file containing sorted read mappings + pattern: "*.{bam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM +output: + ale: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*_ALEoutput.txt": + type: file + description: Output TXT file containing ALE results + pattern: "*_ALEoutput.{txt}" + ontologies: + - edam: "http://edamontology.org/format_2330" # Textual format + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML +authors: + - "@rodtheo" +maintainers: + - "@rodtheo" diff --git a/modules/nf-core/ale/tests/main.nf.test b/modules/nf-core/ale/tests/main.nf.test new file mode 100644 index 000000000..87c5cfa65 --- /dev/null +++ b/modules/nf-core/ale/tests/main.nf.test @@ -0,0 +1,108 @@ +// nf-core modules test ale +nextflow_process { + + name "Test Process ALE" + script "../main.nf" + process "ALE" + + tag "modules" + tag "modules_nfcore" + tag "ale" + + test("sarscov2 [fasta] - paired-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + + test("sarscov2 [fasta_gz] - paired-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + + test("sarscov2 [fasta] - single-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + + test("sarscov2 [fasta_gz] - single-end sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.ale[0][1]).readLines().first().contains("ALE_score") } + ) + } + + } + +} diff --git a/modules/nf-core/ale/tests/main.nf.test.snap b/modules/nf-core/ale/tests/main.nf.test.snap new file mode 100644 index 000000000..64b3e2c77 --- /dev/null +++ b/modules/nf-core/ale/tests/main.nf.test.snap @@ -0,0 +1,142 @@ +{ + "sarscov2 [fasta_gz] - paired-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:19.589167" + }, + "sarscov2 [fasta] - paired-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": false + }, + "test_ALEoutput.txt:md5,4abcbd60ae1dbf78138c97e5fed97f3e" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:11.683035" + }, + "sarscov2 [fasta_gz] - single-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:35.914024" + }, + "sarscov2 [fasta] - single-end sorted bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "1": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ], + "ale": [ + [ + { + "id": "test", + "single_end": true + }, + "test_ALEoutput.txt:md5,fc2e4c521d61c35d69f74ed8294493fb" + ] + ], + "versions": [ + "versions.yml:md5,949da9c6297b613b50e24c421576f3f1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T09:06:27.781196" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 05334a48b..64786674e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -69,6 +69,7 @@ params { skip_spades = false skip_spadeshybrid = false skip_megahit = false + skip_ale = false skip_quast = false skip_prodigal = false skip_metamdbg = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 78466fe16..434fe6c7c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -617,6 +617,10 @@ "type": "boolean", "description": "Skip MEGAHIT assembly." }, + "skip_ale": { + "type": "boolean", + "descrition": "Skip ALE" + }, "skip_quast": { "type": "boolean", "description": "Skip metaQUAST." diff --git a/workflows/mag.nf b/workflows/mag.nf index 7ecec653f..b67f96d53 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -34,6 +34,7 @@ include { PRODIGAL } from '../modules/nf-core/prodigal/ma include { PROKKA } from '../modules/nf-core/prokka/main' include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' include { METAEUK_EASYPREDICT } from '../modules/nf-core/metaeuk/easypredict/main' +include { ALE } from '../modules/nf-core/ale/main' // // MODULE: Local to the pipeline @@ -198,6 +199,22 @@ workflow MAG { ch_assemblies = ch_assemblies.mix(ch_assemblies_split.ungzip, GUNZIP_ASSEMBLYINPUT.out.gunzip) ch_shortread_assemblies = ch_assemblies.filter { it[0].assembler.toUpperCase() in ['SPADES', 'SPADESHYBRID', 'MEGAHIT'] } ch_longread_assemblies = ch_assemblies.filter { it[0].assembler.toUpperCase() in ['FLYE', 'METAMDBG'] } + + if(!params.skip_ale) { + // Create the pair list of read-assembl for ale + ch_assembly_mapping_pairs = ch_short_reads + .join(ch_shortread_assemblies) + .map { reads_tuple, assembly_tuple -> + def meta = reads_tuple[0] + def reads = reads_tuple[1] + def assembly_meta = assembly_tuple[0] + def assembly_file = assembly_tuple[1] + [[meta: meta, assembler: assembly_meta.assembler], reads, assembly_file] + } + + ALE(ch_assembly_mapping_pairs) + ch_versions = ch_versions.mix(ALE.out.versions) + } } if (!params.skip_quast) { From 785e7f4eb172ac0c2a7cbaf58dfd926fd37c9fe1 Mon Sep 17 00:00:00 2001 From: PetcuBogdan Date: Thu, 6 Nov 2025 21:32:33 +0200 Subject: [PATCH 2/7] add ale result output path --- conf/modules.config | 10 ++++++++ docs/output.md | 19 +++++++++++--- workflows/mag.nf | 60 +++++++++++++++++++++++++++++++++------------ 3 files changed, 69 insertions(+), 20 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 93a5ae75c..ca6096cd2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -407,6 +407,16 @@ process { publishDir = [path: { "${params.outdir}/Assembly/${meta.assembler}/QC/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } + withName: 'NFCORE_MAG:MAG:ALE' { + publishDir = [ + path: { "${params.outdir}/Assembly/${meta.assembler?.toUpperCase() ?: 'UNKNOWN'}/QC/${meta.id}/ALE" }, + mode: params.publish_dir_mode, + pattern: "*.{ale.txt,log,ALEoutput.txt,txt}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + ext.prefix = { "${meta.id}" } + } + withName: 'QUAST_BINS|QUAST_BINS_SUMMARY' { publishDir = [ path: { "${params.outdir}/GenomeBinning/QC" }, diff --git a/docs/output.md b/docs/output.md index 556f56683..d952a5470 100644 --- a/docs/output.md +++ b/docs/output.md @@ -205,10 +205,7 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl - `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. - - `ALE/[sample/group]/`: Directory containing Assembly Likelihood Estimator (ALE) results - - `[sample].ale.txt`: ALE score file containing per-contig likelihood estimates - - `[sample].summary.tsv`: Summary statistics of ALE scores and assembly quality - - `[sample].log`: Log file produced by ALE during processing + @@ -304,6 +301,20 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft +### Assembly Quality Control with ALE + +[ALE (Assembly Likelihood Estimator)](https://github.com/sc932/ALE) evaluates assembly quality by computing the likelihood of the sequencing reads given an assembly. ALE provides per-contig quality scores and identifies potentially problematic regions in the assembly by analyzing read mapping patterns and insert size distributions. + +
+Output files + +- `Assembly/[assembler]/QC/[sample/group]/ALE/` + - `[sample].ale.txt`: Per-contig ALE scores and quality metrics, including likelihood estimates for each contig + - `[sample].summary.tsv`: Summary statistics of assembly quality assessment with overall ALE scores and metrics + - `[sample].log`: ALE processing log file containing diagnostic information and runtime details + +
+ ## Gene prediction Protein-coding genes are predicted for each assembly. diff --git a/workflows/mag.nf b/workflows/mag.nf index b67f96d53..785fc21a6 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -199,22 +199,6 @@ workflow MAG { ch_assemblies = ch_assemblies.mix(ch_assemblies_split.ungzip, GUNZIP_ASSEMBLYINPUT.out.gunzip) ch_shortread_assemblies = ch_assemblies.filter { it[0].assembler.toUpperCase() in ['SPADES', 'SPADESHYBRID', 'MEGAHIT'] } ch_longread_assemblies = ch_assemblies.filter { it[0].assembler.toUpperCase() in ['FLYE', 'METAMDBG'] } - - if(!params.skip_ale) { - // Create the pair list of read-assembl for ale - ch_assembly_mapping_pairs = ch_short_reads - .join(ch_shortread_assemblies) - .map { reads_tuple, assembly_tuple -> - def meta = reads_tuple[0] - def reads = reads_tuple[1] - def assembly_meta = assembly_tuple[0] - def assembly_file = assembly_tuple[1] - [[meta: meta, assembler: assembly_meta.assembler], reads, assembly_file] - } - - ALE(ch_assembly_mapping_pairs) - ch_versions = ch_versions.mix(ALE.out.versions) - } } if (!params.skip_quast) { @@ -516,6 +500,50 @@ workflow MAG { } } + /* + ================================================================================ + ALE Analysis + ================================================================================ + */ + + if(!params.skip_ale) { + if ( !params.skip_binning || params.ancient_dna) { + ch_shortread_assemblies_for_ale = ch_assemblies.filter { meta, assembly -> + meta.assembler?.toUpperCase() in ['SPADES', 'SPADESHYBRID', 'MEGAHIT'] + } + + ch_ale_input = BINNING_PREPARATION.out.grouped_mappings + .join(ch_shortread_assemblies_for_ale, by: 0) + .map { meta, contigs, bam, bai, assembly -> + def actual_bam = bam instanceof List ? bam[0] : bam + [meta, assembly, actual_bam] + } + + ch_ale_input.view { "ALE input: Sample ${it[0].id}, Assembly: ${it[1].name}, BAM: ${it[2].name}" } + + + ALE(ch_ale_input) + ch_versions = ch_versions.mix(ALE.out.versions.ifEmpty([])) + } + else { + log.warn """ + ALE (Assembly Likelihood Estimator) Warning + + ALE is enabled (--skip_ale false) but cannot run because: + - Binning is disabled (--skip_binning true) + - Ancient DNA mode is not enabled (--ancient_dna false) + + To run ALE, choose one of the following options: + + 1. Enable binning: --skip_binning false + 2. Enable ancient DNA: --ancient_dna true + 3. Disable ALE: --skip_ale true + + ALE evaluates assembly quality through read mapping analysis. + """.stripIndent() + } + } + // // Collate and save software versions // From 3779476001749cd7f17c04442b68fee025d14825 Mon Sep 17 00:00:00 2001 From: PetcuBogdan Date: Sat, 15 Nov 2025 20:41:32 +0200 Subject: [PATCH 3/7] minor improvements --- conf/modules.config | 4 ++-- docs/output.md | 7 ++++--- workflows/mag.nf | 7 ++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ca6096cd2..88634faf8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -411,8 +411,8 @@ process { publishDir = [ path: { "${params.outdir}/Assembly/${meta.assembler?.toUpperCase() ?: 'UNKNOWN'}/QC/${meta.id}/ALE" }, mode: params.publish_dir_mode, - pattern: "*.{ale.txt,log,ALEoutput.txt,txt}", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.{ale,txt,log}", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.prefix = { "${meta.id}" } } diff --git a/docs/output.md b/docs/output.md index d952a5470..c3d67f5d0 100644 --- a/docs/output.md +++ b/docs/output.md @@ -303,14 +303,15 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft ### Assembly Quality Control with ALE -[ALE (Assembly Likelihood Estimator)](https://github.com/sc932/ALE) evaluates assembly quality by computing the likelihood of the sequencing reads given an assembly. ALE provides per-contig quality scores and identifies potentially problematic regions in the assembly by analyzing read mapping patterns and insert size distributions. +[ALE (Assembly Likelihood Estimator)](https://github.com/sc932/ALE) is a probabilistic framework that evaluates assembly quality by computing the likelihood of the sequencing reads given an assembly. ALE provides per-contig quality scores and identifies potentially problematic regions in assemblies by analyzing read mapping patterns and insert size distributions. It is particularly useful for comparing assemblies and identifying misassemblies or low-confidence regions. + +ALE is run on short-read assemblies (SPAdes, SPAdes hybrid, and MEGAHIT) when binning or ancient DNA analysis is enabled.
Output files - `Assembly/[assembler]/QC/[sample/group]/ALE/` - - `[sample].ale.txt`: Per-contig ALE scores and quality metrics, including likelihood estimates for each contig - - `[sample].summary.tsv`: Summary statistics of assembly quality assessment with overall ALE scores and metrics + - `[sample]_ALEoutput.txt`: Per-contig ALE scores and quality metrics, including likelihood estimates for each contig - `[sample].log`: ALE processing log file containing diagnostic information and runtime details
diff --git a/workflows/mag.nf b/workflows/mag.nf index 785fc21a6..0f9b6bcc6 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -517,17 +517,14 @@ workflow MAG { .map { meta, contigs, bam, bai, assembly -> def actual_bam = bam instanceof List ? bam[0] : bam [meta, assembly, actual_bam] - } - - ch_ale_input.view { "ALE input: Sample ${it[0].id}, Assembly: ${it[1].name}, BAM: ${it[2].name}" } - + } ALE(ch_ale_input) ch_versions = ch_versions.mix(ALE.out.versions.ifEmpty([])) } else { log.warn """ - ALE (Assembly Likelihood Estimator) Warning + [nf-core/mag] ALE (Assembly Likelihood Estimator) Warnings ALE is enabled (--skip_ale false) but cannot run because: - Binning is disabled (--skip_binning true) From 611182a88dd936f02e82638a878b4a3e7d51a58b Mon Sep 17 00:00:00 2001 From: PetcuBogdan Date: Sun, 16 Nov 2025 12:09:06 +0200 Subject: [PATCH 4/7] update changelog.md and usage.md --- CHANGELOG.md | 6 ++++++ docs/usage.md | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e39c9d1ee..6ae0249a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#873](https://github.com/nf-core/mag/pull/873) - Document usage of `longread_percentidentity` and `shortread_percentidentity` and set the value of `longread_percentidentity` in the `test_full` profile to 85 (by @prototaxites) - [#875](https://github.com/nf-core/mag/pull/875) - Add binner COMEBin (by @d4straub) +- [#931](https://github.com/nf-core/mag/pull/931) - Added ALE (Assembly Likelihood Estimator) for probabilistic assembly quality control (by @PetcuBogdan) + - ALE provides per-contig quality scores for short-read assemblies (SPAdes, MEGAHIT) + - Runs automatically when binning is enabled (default behavior) + - Output: `Assembly/[assembler]/QC/[sample]/ALE/` + - Can be disabled with `--skip_ale` parameter + ### `Changed` - [#878](https://github.com/nf-core/mag/pull/878) - Refine test_full config with optimised resource usage for AWS release megatests (by @jfy133) diff --git a/docs/usage.md b/docs/usage.md index ed0845cb2..36f6e065f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -454,6 +454,22 @@ This can also remove 'nonsense' bins of e.g. a single or a collection of very sh Note that in this context, it is recommended to also set `--min_length_unbinned_contigs` to a suitably high value that corresponds to a reasonable bin size if the `-bin_*_length` parameters are used, so you have useful 'singular' contigs in the unbinned output. +## A note on assembly quality control with ALE + +## A note on assembly quality control with ALE + +The pipeline uses [ALE (Assembly Likelihood Estimator)](https://github.com/sc932/ALE) to perform probabilistic quality assessment of short-read assemblies generated by MEGAHIT and SPAdes. + +ALE evaluates assembly quality by computing the likelihood that the assembly could have generated the observed sequencing reads. Unlike traditional assembly QC tools that rely on reference genomes or marker genes, ALE provides a reference-free quality assessment that is particularly useful for novel organisms or complex metagenomes where references may not be available. + +ALE runs automatically when binning is enabled (default behavior), short reads are provided, and assemblies are generated with MEGAHIT or SPAdes. The tool generates quality assessment files in `Assembly/[assembler]/QC/[sample]/ALE/` containing per-assembly likelihood metrics (`[sample]_ALEoutput.txt`). + +ALE scores are log-likelihoods where higher (less negative) values indicate better assembly quality. These scores reflect how well the assembly explains the observed sequencing reads and can help identify assemblies that may have structural issues or errors that could affect downstream binning and annotation. + +If you wish to skip the ALE quality assessment step (for example, to speed up the pipeline when working with well-characterized samples), you can disable it with `--skip_ale`. + +Note that ALE only works with short-read assemblies (MEGAHIT, SPAdes). Long-read assemblies (Flye, MetaMDBG) are not supported by ALE, and hybrid assemblies (SPAdesHybrid) use only the short-read component for ALE scoring. For more information about ALE and how to interpret the results, see the [ALE GitHub repository](https://github.com/sc932/ALE) and the [publication](https://doi.org/10.1093/bioinformatics/bts723). + ## A note on GTDB having too many files or using too many inodes The GTDB is very large both in size and by the number of files it contains. From d0a489de26d302839199503d758d2652c0f89332 Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Sun, 16 Nov 2025 10:53:14 +0000 Subject: [PATCH 5/7] [automated] Fix code linting --- docs/output.md | 1 - workflows/mag.nf | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/output.md b/docs/output.md index 572c5a217..255ae527a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -206,7 +206,6 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl - `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. - ### SPAdes diff --git a/workflows/mag.nf b/workflows/mag.nf index aeca127e8..fc760c852 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -514,31 +514,31 @@ workflow MAG { ch_shortread_assemblies_for_ale = ch_assemblies.filter { meta, assembly -> meta.assembler?.toUpperCase() in ['SPADES', 'SPADESHYBRID', 'MEGAHIT'] } - + ch_ale_input = BINNING_PREPARATION.out.grouped_mappings .join(ch_shortread_assemblies_for_ale, by: 0) .map { meta, contigs, bam, bai, assembly -> def actual_bam = bam instanceof List ? bam[0] : bam [meta, assembly, actual_bam] - } + } ALE(ch_ale_input) - ch_versions = ch_versions.mix(ALE.out.versions.ifEmpty([])) + ch_versions = ch_versions.mix(ALE.out.versions.ifEmpty([])) } else { log.warn """ [nf-core/mag] ALE (Assembly Likelihood Estimator) Warnings - + ALE is enabled (--skip_ale false) but cannot run because: - Binning is disabled (--skip_binning true) - Ancient DNA mode is not enabled (--ancient_dna false) - + To run ALE, choose one of the following options: - + 1. Enable binning: --skip_binning false - 2. Enable ancient DNA: --ancient_dna true + 2. Enable ancient DNA: --ancient_dna true 3. Disable ALE: --skip_ale true - + ALE evaluates assembly quality through read mapping analysis. """.stripIndent() } From fd550b71962b57f43241f5b4feef447ec6a1ab0e Mon Sep 17 00:00:00 2001 From: PetcuBogdan <93000611+PetcuBogdan@users.noreply.github.com> Date: Tue, 18 Nov 2025 20:39:26 +0200 Subject: [PATCH 6/7] fix duplicate from usage.md Co-authored-by: Daniel Straub <42973691+d4straub@users.noreply.github.com> --- docs/usage.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 36f6e065f..7d7140b0a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -456,8 +456,6 @@ Note that in this context, it is recommended to also set `--min_length_unbinned_ ## A note on assembly quality control with ALE -## A note on assembly quality control with ALE - The pipeline uses [ALE (Assembly Likelihood Estimator)](https://github.com/sc932/ALE) to perform probabilistic quality assessment of short-read assemblies generated by MEGAHIT and SPAdes. ALE evaluates assembly quality by computing the likelihood that the assembly could have generated the observed sequencing reads. Unlike traditional assembly QC tools that rely on reference genomes or marker genes, ALE provides a reference-free quality assessment that is particularly useful for novel organisms or complex metagenomes where references may not be available. From ce4298cc80491d523a49eaa10efeb97469fa67b8 Mon Sep 17 00:00:00 2001 From: PetcuBogdan <93000611+PetcuBogdan@users.noreply.github.com> Date: Tue, 18 Nov 2025 20:41:26 +0200 Subject: [PATCH 7/7] fix misspelled from nextflow_schema Co-authored-by: Daniel Straub <42973691+d4straub@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 736d2feab..7bb9d8398 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -631,7 +631,7 @@ }, "skip_ale": { "type": "boolean", - "descrition": "Skip ALE" + "description": "Skip ALE" }, "skip_quast": { "type": "boolean",