make sure version information is collected from all processes

Zachary Foster · Zachary Foster · commit cb316ffe3b60 · 2025-05-14T11:49:46.000-07:00
diff --git a/assets/main_report/.gitignore b/assets/main_report/.gitignore
@@ -0,0 +1 @@
+/.quarto/
diff --git a/assets/main_report/index.qmd b/assets/main_report/index.qmd
@@ -1,7 +1,7 @@
 ---
 title: Pathogen Surveillance Report
 params:
-    inputs: ""
+    inputs: "~/downloads/_no_group_defined__inputs"
 execute:
     echo: false
 bibliography: references.bib
diff --git a/conf/modules.config b/conf/modules.config
@@ -203,7 +203,7 @@ process {
         ext.args   = { secrets.NCBI_API_KEY ? "--api-key ${secrets.NCBI_API_KEY}" : "" }
 
         // Settings to avoid API rate limits and not put too much stress on servers
-        maxForks = 1 // NCBI seems to be not allowing concurrent downloads with this command, althogh I cannot find any documentation about this
+        maxForks = 1 // NCBI seems to be not allowing concurrent downloads with this command, although I cannot find any documentation about this
         errorStrategy = { task.attempt >= 3 ? 'ignore' : 'retry' }
         beforeScript = { task.attempt == 1 ? 'sleep "0.$(($RANDOM % 100))"' : "sleep ${Math.pow(5, task.attempt)}" }
     }
@@ -212,6 +212,7 @@ process {
         publishDir = [
             enabled: false
         ]
+        ext.args   = { "--dosage 1000000 --full-annot -n --align-off" }
     }
 
     withName: FASTP {
@@ -229,7 +230,10 @@ process {
         cpus       = { 2     * task.attempt }
         memory     = { 8.GB  * task.attempt }
         time       = { 12.h  * task.attempt }
-        ext.args   = { params.temp_dir ? "--dir ${params.temp_dir} --quiet" : "--quiet" }
+        ext.args   = { [
+            params.temp_dir ? "--dir ${params.temp_dir}" : "",
+            "--quiet"
+        ].minus("").join(" ") }
     }
 
     withName: FILTER_ASSEMBLY {
@@ -251,7 +255,10 @@ process {
         memory = { 1.GB * task.attempt }
         time   = { 12.h * task.attempt }
         storeDir = { params.data_dir == "false" ? null : "${params.data_dir}/assembly_metadata" }
-        ext.args   = { secrets.NCBI_API_KEY ? "--as-json-lines --api-key ${secrets.NCBI_API_KEY}" : "--as-json-lines" }
+        ext.args   = { [
+            secrets.NCBI_API_KEY ? "--api-key ${secrets.NCBI_API_KEY}": "",
+            "--as-json-lines"
+        ].minus("").join(" ") }
         maxRetries = 2
 
         // Settings to avoid API rate limits and not put too much stress on servers
diff --git a/docs/usage.md b/docs/usage.md
@@ -99,12 +99,12 @@ The reference metadata TSV or the sample metadata TSV can have the following col
 The typical command for running the pipeline is as follows:
 
 ```bash
-nextflow run nf-core/pathogensurveillance -profile <REPLACE WITH RUN TOOL> -resume --input <REPLACE WITH TSV/CSV> --outdir <REPLACE WITH OUTPUT PATH>
+nextflow run nf-core/pathogensurveillance -profile <REPLACE WITH PACKAGE MANAGER> -resume --input <REPLACE WITH TSV/CSV> --outdir <REPLACE WITH OUTPUT PATH>
 ```
 
 Where:
 
-- `<REPLACE WITH RUN TOOL>` is one of docker, singularity, podman, shifter, charliecloud, or conda
+- `<REPLACE WITH PACKAGE MANAGER>` is one of docker, singularity, podman, shifter, charliecloud, or conda
 - `<REPLACE WITH TSV/CSV>` is the path to the input samplesheet
 - `<REPLACE WITH OUTPUT PATH>` is the path to where to save the output
 
diff --git a/modules/local/download_assemblies/Dockerfile b/modules/local/download_assemblies/Dockerfile
@@ -1,4 +1,4 @@
-# Dockerfile to create container with Cell Ranger v8.0.0 and bcl2fastq v2.20.0
+# Dockerfile to create container with ncbi-datasets-cli=16.0.0 and unzip and bioconda::samtools=1.18
 # Push to nfcore/ncbi-datasets-cli:<VER>
 
 FROM condaforge/mambaforge:23.1.0-4
diff --git a/modules/local/extract_feature_sequences/main.nf b/modules/local/extract_feature_sequences/main.nf
@@ -18,10 +18,10 @@ process EXTRACT_FEATURE_SEQUENCES {
     task.ext.when == null || task.ext.when
 
     script:
+    def args = task.ext.args ?: ''
     prefix = task.ext.prefix ?: "${ref_meta.id}"
     """
-    # Set dosage ultra high to include all high copy sequences per strain, just get everything
-    align_feature_sequences_mod.pl --dosage 1000000 --full-annot -i PIRATE.gene_families.ordered.tsv -g modified_gffs/ -o ${prefix}_feature_sequences/ -p ${task.cpus} -n --align-off
+    align_feature_sequences_mod.pl ${args} -i PIRATE.gene_families.ordered.tsv -g modified_gffs/ -o ${prefix}_feature_sequences/ -p ${task.cpus}
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/filter_assembly/main.nf b/modules/local/filter_assembly/main.nf
@@ -25,6 +25,8 @@ process FILTER_ASSEMBLY {
         $args \\
         ${prefix}_unzipped.fasta > ${prefix}_filtered.fasta
 
+    rm ${prefix}_unzipped.fasta
+
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         biopython: \$(python -c "import Bio; print(Bio.__version__)")
diff --git a/modules/local/graphtyper/genotype/main.nf b/modules/local/graphtyper/genotype/main.nf
@@ -55,10 +55,15 @@ process GRAPHTYPER_GENOTYPE {
 
     # Move result files into working directory for output
     find results -maxdepth 2 -name '*.vcf*' > output_paths.txt
-    sed 's_results/__g' output_paths.txt | sed 's|/|-|g' > output_names.txt
+    sed -e 's|results/||g' -e 's|/|-|g' output_paths.txt > output_names.txt
     paste -d ' ' output_paths.txt output_names.txt | xargs -I {} echo "mv {}" > mv_commands.sh
     source mv_commands.sh
 
+    # Clean up
+    if [[ $ref =~ \\.gz\$ ]]; then
+       rm __my__reference__.fasta
+    fi
+
     # Save version information for graphtyper
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/local/main_report/main.nf b/modules/local/main_report/main.nf
@@ -37,7 +37,9 @@ process MAIN_REPORT {
 
     # Rename outputs
     mv main_report/${prefix}_report/index.html ${prefix}_pathsurveil_report.html
-    #mv main_report/${prefix}_report/index.pdf ${prefix}_pathsurveil_report.pdf
+
+    # Clean up
+    rm -r main_report
 
     # Save version of quarto used
     cat <<-END_VERSIONS > versions.yml
diff --git a/modules/local/make_gff_with_fasta/main.nf b/modules/local/make_gff_with_fasta/main.nf
@@ -12,6 +12,7 @@ process MAKE_GFF_WITH_FASTA {
 
     output:
     tuple val(meta), path("${prefix}.gff"), emit: gff
+    path "versions.yml"                   , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
@@ -39,5 +40,10 @@ process MAKE_GFF_WITH_FASTA {
     # Rename output file to be just the sample ID and make sure input file does not have same name
     mv ${gff} input_${gff}
     mv ${prefix}_with_ref.gff ${prefix}.gff
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sed: \$(sed --version | head -n 1 | sed 's/sed (GNU sed) //')
+    END_VERSIONS
     """
 }
diff --git a/modules/local/picard_format/main.nf b/modules/local/picard_format/main.nf
@@ -1,4 +1,3 @@
-
 process PICARD_FORMAT {
     tag "$meta.id"
     label 'process_low'
diff --git a/modules/local/reformat_pirate_results/main.nf b/modules/local/reformat_pirate_results/main.nf
@@ -32,6 +32,9 @@ process REFORMAT_PIRATE_RESULTS {
     # gene/allele presence-absence
     PIRATE_to_Rtab.pl -i ${prefix}_gene_family.tsv -o ${prefix}_genePA.tsv
 
+    # Clean up
+    rm -r reformatted_gffs
+
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         pirate: \$( echo \$( PIRATE --version 2>&1) | sed 's/PIRATE //' )
diff --git a/subworkflows/local/align_reads/main.nf b/subworkflows/local/align_reads/main.nf
@@ -41,6 +41,7 @@ workflow ALIGN_READS {
         .join(ch_reference)
         .join(ch_ref_index)
     PICARD_FORMAT ( picard_input )
+    versions = versions.mix(PICARD_FORMAT.out.versions)
 
     SAMTOOLS_INDEX ( PICARD_FORMAT.out.bam )
     versions = versions.mix(SAMTOOLS_INDEX.out.versions)
diff --git a/subworkflows/local/core_genome_phylogeny/main.nf b/subworkflows/local/core_genome_phylogeny/main.nf
@@ -46,6 +46,7 @@ workflow CORE_GENOME_PHYLOGENY {
         params.n_ref_closest_named,
         params.n_ref_context
     )
+    versions = versions.mix(ASSIGN_CORE_REFERENCES.out.versions)
 
     // Get relevant information from all references assigned to samples
     all_ref_data =  sample_data
@@ -119,6 +120,7 @@ workflow CORE_GENOME_PHYLOGENY {
                 [ref_meta, ref_path, ref_gff]
             }
     )
+    versions = versions.mix(MAKE_GFF_WITH_FASTA.out.versions)
 
     // group samples by report group
     bakta_gffs = all_assem_data
@@ -159,6 +161,7 @@ workflow CORE_GENOME_PHYLOGENY {
     CALCULATE_POCP (
         REFORMAT_PIRATE_RESULTS.out.gene_fam_pa
     )
+    versions = versions.mix(CALCULATE_POCP.out.versions)
 
     // Extract sequences of all genes
     EXTRACT_FEATURE_SEQUENCES ( good_pirate_results )
diff --git a/subworkflows/local/genome_assembly/main.nf b/subworkflows/local/genome_assembly/main.nf
@@ -85,11 +85,13 @@ workflow GENOME_ASSEMBLY {
         filtered_input.nanopore_prokaryote.mix(filtered_input.nanopore_eukaryote),
         "--nano-raw"
     )
+    versions = versions.mix(FLYE_NANOPORE.out.versions)
 
     FLYE_PACBIO (
         filtered_input.pacbio_prokaryote.mix(filtered_input.pacbio_eukaryote),
         "--pacbio-raw"
     )
+    versions = versions.mix(FLYE_PACBIO.out.versions)
 
     FILTER_ASSEMBLY (
         SPADES.out.scaffolds
diff --git a/subworkflows/local/prepare_input/main.nf b/subworkflows/local/prepare_input/main.nf
@@ -25,6 +25,7 @@ workflow PREPARE_INPUT {
 
     // Parse input tables
     SAMPLESHEET_CHECK ( sample_data_tsv, reference_data_tsv, params.max_samples )
+    versions = versions.mix(SAMPLESHEET_CHECK.out.versions)
     sample_data = SAMPLESHEET_CHECK.out.sample_data
         .splitCsv ( header:true, sep:'\t', quote:'"' )
         .map { create_sample_metadata_channel(it) }
@@ -160,6 +161,7 @@ workflow PREPARE_INPUT {
     PARSE_ASSEMBLIES (
         FIND_ASSEMBLIES.out.stats
     )
+    versions = versions.mix(PARSE_ASSEMBLIES.out.versions)
 
     // Add placeholders for NCBI reference metadata if none was looked up
     ncbi_ref_meta = family_taxon_ids
@@ -195,6 +197,7 @@ workflow PREPARE_INPUT {
         params.n_ref_genera,
         params.only_latin_binomial_refs
     )
+    versions = versions.mix(PICK_ASSEMBLIES.out.versions)
 
     // Add placeholders for PICK_ASSEMBLIES output if not run
     picked_assemblies_stat_files = sample_data
@@ -316,6 +319,7 @@ workflow PREPARE_INPUT {
             }
             .unique(),
     )
+    versions = versions.mix(SEQKIT_STATS.out.versions)
     read_count = SEQKIT_STATS.out.stats
         .splitCsv ( header:true, sep:'\t', elem: 1 )
         .map { sample_meta, stats ->
diff --git a/subworkflows/local/variant_analysis/main.nf b/subworkflows/local/variant_analysis/main.nf
@@ -51,6 +51,7 @@ workflow VARIANT_ANALYSIS {
         ani_matrix.join(samp_ref_pairs),
         params.ref_min_ani
     )
+    versions = versions.mix(ASSIGN_MAPPING_REFERENCE.out.versions)
     ref_paths = references
         .map {sample_id, report_group_id, ref_id, ref_name, ref_desc, ref_path, usage ->
             [[id: sample_id], [id:report_group_id], [id: ref_id], ref_path, usage]
@@ -99,6 +100,7 @@ workflow VARIANT_ANALYSIS {
         }
         .unique()
     )
+    versions = versions.mix(SEQKIT_SLIDING.out.versions)
     chopped_reads = SEQKIT_SLIDING.out.fastx
         .combine(longreads, by: 0)
         .map { sample_meta, chopped_reads, report_meta, ref_meta, ref_path, usage, read_paths, sequence_type, ploidy ->

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# Dockerfile to create container with Cell Ranger v8.0.0 and bcl2fastq v2.20.0`
	`1`	`+# Dockerfile to create container with ncbi-datasets-cli=16.0.0 and unzip and bioconda::samtools=1.18`
`2`	`2`	`# Push to nfcore/ncbi-datasets-cli:<VER>`
`3`	`3`
`4`	`4`	`FROM condaforge/mambaforge:23.1.0-4`