Skip to content

Commit c315435

Browse files
committed
Merge branch 'CW-4947-maintenance' into 'dev'
Bump version [CW-4947] See merge request epi2melabs/workflows/wf-metagenomics!237
2 parents b40efcf + 8fd00de commit c315435

File tree

5 files changed

+29
-31
lines changed

5 files changed

+29
-31
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

77

8-
## [Unreleased]
8+
## [v2.13.0]
99
### Changed
1010
- Update Seqkit version(>2.6.0) so Abricate can handle bgzip files without decompressing.
1111
- Split taxonomy classification from minimap2 alignment process to make it more modular.

modules/local/common.nf

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,23 +127,20 @@ process createAbundanceTables {
127127
/* Extract reads in FASTQ from a list of IDs.
128128
Use for example to output the unclassified reads.
129129
*/
130-
process extractReads {
130+
process publishReads {
131131
label "wfmetagenomics"
132132
publishDir "${params.out_dir}/${output_name}", mode: 'copy', pattern: "*.${output_name}.fq.gz", enabled: params.output_unclassified
133133
tag "${meta.alias}"
134134
cpus 1
135135
memory 4.GB
136136
input:
137-
tuple val(meta), path(concat_seqs), path("ids.txt")
137+
tuple val(meta), path("reads.fq.gz"), path("ids.txt")
138138
val output_name
139139
output:
140140
path "${meta.alias}.${output_name}.fq.gz"
141-
// No output, can publish results in the process?
142-
// At this moment, input sequences are o FASTQ
143-
// or BAM with the return_fastq option enable
144141
script:
145142
"""
146-
seqkit grep --pattern-file ids.txt "${concat_seqs}" -o "${meta.alias}.${output_name}.fq.gz"
143+
seqkit grep --pattern-file ids.txt reads.fq.gz -o "${meta.alias}.${output_name}.fq.gz"
147144
"""
148145
}
149146

nextflow.config

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ manifest {
135135
description = 'Identification of the origin of single reads from both amplicon-targeted and shotgun metagenomics sequencing.'
136136
mainScript = 'main.nf'
137137
nextflowVersion = '>=23.04.2'
138-
version = 'v2.12.1'
138+
version = 'v2.13.0'
139139
}
140140

141141

subworkflows/kraken_pipeline.nf

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ include {
55
run_common;
66
createAbundanceTables;
77
publish;
8-
extractReads;
98
} from "../modules/local/common"
109

1110

@@ -16,6 +15,10 @@ process run_kraken2 {
1615
label 'wfmetagenomics'
1716
tag "${meta.alias}"
1817
publishDir "${params.out_dir}/kraken2", mode: 'copy', pattern: "*kraken2.report.txt*"
18+
publishDir (
19+
"${params.out_dir}/unclassified", mode: 'copy',
20+
pattern: "${meta.alias}.unclassified.fq.gz", enabled: params.output_unclassified
21+
)
1922
cpus params.threads
2023
// Set the memory required to the size of the database + 4GB overhead.
2124
memory {
@@ -26,13 +29,18 @@ process run_kraken2 {
2629
}
2730
}
2831
errorStrategy {
29-
task.exitStatus == 137 ? log.error("Error 137 may indicate the process ran out of memory.\nIf you are using Docker you should check the amount of RAM allocated to your Docker server.") : ''
32+
task.exitStatus == 137 ? log.error(
33+
'''
34+
Error 137 may indicate the process ran out of memory.
35+
If you are using Docker you should check the amount of
36+
RAM allocated to your Docker server.
37+
'''.stripIndent()) : ''
3038
log.error("Consider to use --kraken2_memory_mapping to reduce the use of RAM memory.")
3139
}
3240
input:
3341
tuple(
3442
val(meta),
35-
path(concat_seqs),
43+
path("reads.fq.gz"),
3644
path(fastq_stats)
3745
)
3846
path kraken_db
@@ -44,22 +52,25 @@ process run_kraken2 {
4452
path("${meta.alias}.kraken2.assignments.tsv"),
4553
emit: kraken2_reports
4654
)
47-
tuple (
55+
tuple(
4856
val(meta),
49-
path("${meta.alias}.unclassified.txt"),
50-
emit: unclassified_ids
57+
path("${meta.alias}.unclassified.fq.gz"),
58+
emit: kraken2_unclassified, optional:true
5159
)
5260
script:
5361
def sample_id = "${meta.alias}"
5462
def memory_mapping = params.kraken2_memory_mapping ? '--memory-mapping' : ''
63+
def unclassified_tmp = "${meta.alias}.unclassified.fq"
64+
def output_unclassified = params.output_unclassified ? '--unclassified-out ' + unclassified_tmp: ''
5565
"""
56-
kraken2 --db ${kraken_db} ${concat_seqs} \
66+
kraken2 --db ${kraken_db} reads.fq.gz \
5767
--threads $task.cpus \
5868
--report "${sample_id}.kraken2.report.txt" \
59-
--confidence ${params.kraken2_confidence} ${memory_mapping} > "${sample_id}.kraken2.assignments.tsv"
60-
# Recover unclassified IDs
61-
csvtk filter2 --no-header-row --tabs -f '\$1=="U"' "${sample_id}.kraken2.assignments.tsv" \
62-
| cut -f2 > "${meta.alias}.unclassified.txt"
69+
--confidence ${params.kraken2_confidence} ${memory_mapping} \
70+
$output_unclassified > "${sample_id}.kraken2.assignments.tsv"
71+
if [ -f $unclassified_tmp ]; then
72+
bgzip "${meta.alias}.unclassified.fq"
73+
fi
6374
"""
6475
}
6576

@@ -214,16 +225,6 @@ workflow kraken_pipeline {
214225
// Find out size of the db. Cannot be done within the process
215226
database_main_file_size = database.resolve('hash.k2d').size()
216227
kraken2_reports = run_kraken2(samples, database, database_main_file_size)
217-
// Output unclassified
218-
if (params.output_unclassified) {
219-
unclassified_to_extract = samples.join(kraken2_reports.unclassified_ids
220-
)
221-
| map { meta, seqs, stats, unclassified_ids ->
222-
[meta, seqs, unclassified_ids]
223-
}
224-
extractReads(unclassified_to_extract, "unclassified")
225-
}
226-
227228
// Run bracken
228229
bracken_reports = run_bracken(kraken2_reports.kraken2_reports, database, taxonomy, bracken_length, taxonomic_rank)
229230
lineages = bracken_reports.bracken_json

subworkflows/minimap_pipeline.nf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ include {
77
run_common;
88
createAbundanceTables;
99
publish;
10-
extractReads;
10+
publishReads;
1111
} from "../modules/local/common"
1212

1313
OPTIONAL_FILE = file("$projectDir/data/OPTIONAL_FILE")
@@ -312,7 +312,7 @@ workflow minimap_pipeline {
312312
).map { meta, seqs, stats, unclassified_ids ->
313313
[meta, seqs, unclassified_ids]
314314
}
315-
extractReads(unclassified_to_extract, "unclassified")
315+
publishReads(unclassified_to_extract, "unclassified")
316316
}
317317
// Use initial reads stats (after fastcat) QC, but update meta
318318
for_report = samples

0 commit comments

Comments (0)