From 75efd153dda376148e45ab751cdc6d37af2c1e0f Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 15:56:45 +0100 Subject: [PATCH 01/58] Updates to modules --- modules/local/extract/telo/main.nf | 43 ++++++++++++++++ modules/local/gawk/environment.yml | 7 +++ modules/local/gawk/main.nf | 68 +++++++++++++++++++++++++ modules/local/gawk/meta.yml | 63 +++++++++++++++++++++++ modules/local/pretext/graph/main.nf | 77 +++++++++++++++++++++++++---- 5 files changed, 248 insertions(+), 10 deletions(-) create mode 100755 modules/local/extract/telo/main.nf create mode 100644 modules/local/gawk/environment.yml create mode 100644 modules/local/gawk/main.nf create mode 100644 modules/local/gawk/meta.yml diff --git a/modules/local/extract/telo/main.nf b/modules/local/extract/telo/main.nf new file mode 100755 index 00000000..380c1acf --- /dev/null +++ b/modules/local/extract/telo/main.nf @@ -0,0 +1,43 @@ +process EXTRACT_TELO { + tag "${meta.id}" + label 'process_low' + + conda "conda-forge::coreutils=9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'docker.io/ubuntu:20.04' }" + + input: + tuple val( meta ), path( file ) // input table; the script reads whitespace-separated columns 2, 4 and 5 + + output: + tuple val( meta ), path( "*bed" ) , emit: bed // ${prefix}_telomere.bed (columns 2, 4, 5 with '>' stripped) + tuple val( meta ), path("*bedgraph"), emit: bedgraph // ${prefix}_telomere.bedgraph (same columns plus |col5 - col4|) + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+ """ + cat "${file}" | awk '{print \$2"\\t"\$4"\\t"\$5}' | sed 's/>//g' > ${prefix}_telomere.bed + cat "${file}" | awk '{print \$2"\\t"\$4"\\t"\$5"\\t"(((\$5-\$4)<0)?-(\$5-\$4):(\$5-\$4))}' | sed 's/>//g' > ${prefix}_telomere.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch ${prefix}_telomere.bed + touch ${prefix}_telomere.bedgraph + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/gawk/environment.yml b/modules/local/gawk/environment.yml new file mode 100644 index 00000000..f52109e8 --- /dev/null +++ b/modules/local/gawk/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::gawk=5.3.0 diff --git a/modules/local/gawk/main.nf b/modules/local/gawk/main.nf new file mode 100644 index 00000000..f7f34b2e --- /dev/null +++ b/modules/local/gawk/main.nf @@ -0,0 +1,68 @@ +process GAWK { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/gawk:5.3.0' : + 'biocontainers/gawk:5.3.0' }" + + input: + tuple val(meta), path(input, arity: '0..*') + path(program_file) + val(disable_redirect_output) + + output: + tuple val(meta), path("direction.0.${suffix}"), emit: prime3 + tuple val(meta), path("direction.1.${suffix}"), emit: prime5 + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + def args2 = task.ext.args2 ?: '' // args2 is used to specify a program when no program file has been given + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.collect{ it.getExtension()}.get(0)}" // use the first extension of the input files + + program = program_file ? "-f ${program_file}" : "${args2}" + lst_gz = input.findResults{ it.getExtension().endsWith("gz") ? it.toString() : null } + unzip = lst_gz ? "gunzip -q -f ${lst_gz.join(" ")}" : "" + input_cmd = input.collect { it.toString() - ~/\.gz$/ }.join(" ") + cleanup = lst_gz ? "rm ${lst_gz.collect{ it - ~/\.gz$/ }.join(" ")}" : "" + + input.collect{ + assert it.name != "${prefix}.${suffix}" : "Input and output names are the same, set prefix in module configuration to disambiguate!" + } + + """ + ${unzip} + + awk \\ + ${args} \\ + ${program} \\ + ${input_cmd} + + ${cleanup} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.collect{ it.getExtension()}.get(0)}" // input is a list (arity '0..*'): use the first extension, as in the script block + def create_cmd = suffix.endsWith("gz") ?
"echo '' | gzip >" : "touch" + + """ + for i in 0 1; do ${create_cmd} direction.\${i}.${suffix}; done # one placeholder per declared output (prime3, prime5) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/gawk/meta.yml b/modules/local/gawk/meta.yml new file mode 100644 index 00000000..34c50b12 --- /dev/null +++ b/modules/local/gawk/meta.yml @@ -0,0 +1,63 @@ +name: "gawk" +description: | + If you are like many computer users, you would frequently like to make changes in various text files + wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. + The job is easy with awk, especially the GNU implementation gawk. +keywords: + - gawk + - awk + - txt + - text + - file parsing +tools: + - "gawk": + description: "GNU awk" + homepage: "https://www.gnu.org/software/gawk/" + documentation: "https://www.gnu.org/software/gawk/manual/" + tool_dev_url: "https://www.gnu.org/prep/ftp.html" + licence: ["GPL v3"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: The input file - Specify the logic that needs to be executed on + this file on the `ext.args2` or in the program file. + If the files have a `.gz` extension, they will be unzipped using `gunzip`. + pattern: "*" + - - program_file: + type: file + description: Optional file containing logic for awk to execute. If you don't + wish to use a file, you can use `ext.args2` to specify the logic. + pattern: "*" + - - disable_redirect_output: + type: boolean + description: Disable the redirection of awk output to a given file. This is + useful if you want to use awk's built-in redirect to write files instead + of the shell's redirect. +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - "*.${suffix}": + type: file + description: The output file - if using shell redirection, specify the name of this + file using `ext.prefix` and the extension using `ext.suffix`. Otherwise, ensure + the awk program produces files with the extension in `ext.suffix`. + pattern: "*" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/local/pretext/graph/main.nf b/modules/local/pretext/graph/main.nf index 4e9c92ad..ac966417 100644 --- a/modules/local/pretext/graph/main.nf +++ b/modules/local/pretext/graph/main.nf @@ -5,11 +5,12 @@ process PRETEXT_GRAPH { container "quay.io/sanger-tol/pretext:0.0.9-yy5-c2" input: - tuple val(meta), path(pretext_file) + tuple val(meta), path(pretext_file) path(gap_file, stageAs: 'gap_file.bed') path(coverage, stageAs: 'coverage.bw') - path(telomere_file, stageAs: 'telomere.bed') + path(telomere_file, stageAs: 'telomere/*') path(repeat_density, stageAs: 'repeat_density.bw') + val(split_telo_bool) output: tuple val(meta), path("*.pretext") , emit: pretext @@ -30,7 +31,6 @@ process PRETEXT_GRAPH { // Using single [ ] as nextflow will use sh where possible not bash """ - echo "PROCESSING ESSENTIAL FILES" if [ -s "${coverage}" ]; then @@ -50,20 +50,78 @@ process PRETEXT_GRAPH { fi echo "NOW PROCESSING NON-ESSENTIAL files" - input_file="repeat.pretext.part" - if [ -s "${gap_file}" ]; then echo "Processing GAP file..." cat "${gap_file}" | PretextGraph ${args} -i repeat.pretext.part -n "gap" -o gap.pretext.part input_file="gap.pretext.part" fi - if [ -s "${telomere_file}" ]; then - echo "Processing TELO file..." 
- cat "${telomere_file}" | PretextGraph ${args} -i "\$input_file" -n "telomere" -o "${prefix}.pretext" + # Check if telomere directory has any files + if [ "\$(ls -A telomere 2>/dev/null)" ]; then + file_telox="" + file_5p="" + file_3p="" + file_og="" + + for file in telomere/*.bedgraph; do + [ -e "\$file" ] || continue # skip if no match + fname=\$(basename "\$file") + + case "\$fname" in + *telox*) + echo + file_telox="\$file" + ;; + *5P*) + file_5p="\$file" + ;; + *3P*) + file_3p="\$file" + ;; + *) + file_og="\$file" + ;; + esac + done + + ls telomere/* + echo \$file_og + + if [ -s "\$file_og" ]; then + echo "Processing OG_TELOMERE file..." + PretextGraph $args -i "\$input_file" -n "og_telomere" -o telo_0.pretext < "\$file_og" + else + echo "No OG TELOMERE file" + cp "\$input_file" telo_0.pretext + fi + + if [ -s "\$file_telox" ]; then + echo "Processing TELOX_TELOMERE file..." + PretextGraph $args -i telo_0.pretext -n "telox_telomere" -o telo_1.pretext < "\$file_telox" + else + echo "No TELOX file" + cp telo_0.pretext telo_1.pretext + fi + + if [ -s "\$file_5p" ]; then + echo "Processing 5 Prime TELOMERE file..." + PretextGraph $args -i telo_1.pretext -n "5p_telomere" -o telo_2.pretext < "\$file_5p" + else + echo "No 5Prime TELOMERE file" + cp telo_1.pretext telo_2.pretext + fi + + if [ -s "\$file_3p" ]; then + echo "Processing 3 Prime TELOMERE file..." + PretextGraph $args -i telo_2.pretext -n "3p_telomere" -o "${prefix}.pretext" < "\$file_3p" + else + echo "No 3Prime TELOMERE file" + cp telo_2.pretext "${prefix}.pretext" + fi + else - mv "\$input_file" "${prefix}.pretext" + cp "\$input_file" "${prefix}.pretext" fi cat <<-END_VERSIONS > versions.yml @@ -84,7 +142,6 @@ process PRETEXT_GRAPH { def UCSC_VERSION = '448' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
""" touch ${prefix}.pretext - cat <<-END_VERSIONS > versions.yml "${task.process}": PretextGraph: \$(PretextGraph | grep "Version" | sed 's/Pretext* Version //;') From 4c07238e253e4754aa45be512a8aa2aaeedf4c01 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 15:57:54 +0100 Subject: [PATCH 02/58] Update modules --- modules/nf-core/cat/cat/environment.yml | 7 + modules/nf-core/cat/cat/main.nf | 78 +++++++ modules/nf-core/cat/cat/meta.yml | 46 ++++ modules/nf-core/cat/cat/tests/main.nf.test | 191 ++++++++++++++++ .../nf-core/cat/cat/tests/main.nf.test.snap | 147 +++++++++++++ .../cat/tests/nextflow_unzipped_zipped.config | 6 + .../cat/tests/nextflow_zipped_unzipped.config | 8 + .../nf-core/tabix/bgziptabix/environment.yml | 8 + modules/nf-core/tabix/bgziptabix/main.nf | 48 ++++ modules/nf-core/tabix/bgziptabix/meta.yml | 74 +++++++ .../tabix/bgziptabix/tests/main.nf.test | 123 +++++++++++ .../tabix/bgziptabix/tests/main.nf.test.snap | 206 ++++++++++++++++++ .../tabix/bgziptabix/tests/tabix_csi.config | 5 + .../tabix/bgziptabix/tests/tabix_tbi.config | 5 + 14 files changed, 952 insertions(+) create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/main.nf create mode 100644 modules/nf-core/cat/cat/meta.yml create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config create mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config create mode 100644 modules/nf-core/tabix/bgziptabix/environment.yml create mode 100644 modules/nf-core/tabix/bgziptabix/main.nf create mode 100644 modules/nf-core/tabix/bgziptabix/meta.yml create mode 100644 modules/nf-core/tabix/bgziptabix/tests/main.nf.test create mode 100644 modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap create mode 100644 modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config create mode 
100644 modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..50c2059a --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 00000000..2862c64c --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,78 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // choose appropriate concatenation tool depending on input and output format + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? 
"| pigz -c -p $task.cpus $args2" : '' + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} + +// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz +def getFileSuffix(filename) { + def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ + return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) +} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 00000000..2a9284d7 --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,46 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" + ontologies: [] +output: + file_out: + - - meta: + type: map + description: Groovy Map containing sample information + - ${prefix}: + type: file + description: Concatenated file. Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..9cb16178 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,191 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_name_conflict") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'genome', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") }, + { assert snapshot(process.out.versions).match() } + ) + } + } + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + 
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot( + lines[0..5], + lines.size(), + process.out.versions + ).match() + } + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert 
process.success }, + { assert snapshot( + lines[0..5], + lines.size(), + process.out.versions + ).match() + } + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot( + lines[0..5], + lines.size(), + process.out.versions + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..b7623ee6 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,147 @@ +{ + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": 
"24.04.3" + }, + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ], + 78, + [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:51:46.802978" + }, + "test_cat_name_conflict": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:51:29.45394" + }, + "test_cat_one_file_unzipped_zipped": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + 
"ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ], + 374, + [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:52:02.774016" + }, + "test_cat_unzipped_zipped": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ], + 375, + [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T11:51:57.581523" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/tabix/bgziptabix/environment.yml b/modules/nf-core/tabix/bgziptabix/environment.yml new file mode 100644 index 00000000..771b1387 --- /dev/null +++ 
b/modules/nf-core/tabix/bgziptabix/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::htslib=1.21 diff --git a/modules/nf-core/tabix/bgziptabix/main.nf b/modules/nf-core/tabix/bgziptabix/main.nf new file mode 100644 index 00000000..f295c7f2 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/main.nf @@ -0,0 +1,48 @@ +process TABIX_BGZIPTABIX { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/92/92859404d861ae01afb87e2b789aebc71c0ab546397af890c7df74e4ee22c8dd/data' : + 'community.wave.seqera.io/library/htslib:1.21--ff8e28a189fbecaa' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.gz"), path("*.tbi"), optional: true, emit: gz_tbi + tuple val(meta), path("*.gz"), path("*.csi"), optional: true, emit: gz_csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bgzip --threads ${task.cpus} -c $args $input > ${prefix}.${input.getExtension()}.gz + tabix --threads ${task.cpus} $args2 ${prefix}.${input.getExtension()}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def args2 = task.ext.args2 ?: '' + def index = args2.contains("-C ") || args2.contains("--csi") ? 
"csi" : "tbi" + """ + echo "" | gzip > ${prefix}.${input.getExtension()}.gz + touch ${prefix}.${input.getExtension()}.gz.${index} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/bgziptabix/meta.yml b/modules/nf-core/tabix/bgziptabix/meta.yml new file mode 100644 index 00000000..9c2c46d1 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/meta.yml @@ -0,0 +1,74 @@ +name: tabix_bgziptabix +description: bgzip a sorted tab-delimited genome file and then create tabix index +keywords: + - bgzip + - compress + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] + identifier: biotools:tabix +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Sorted tab-delimited genome file + ontologies: [] +output: + gz_tbi: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.gz": + type: file + description: bgzipped tab-delimited genome file + pattern: "*.gz" + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + - "*.tbi": + type: file + description: tabix index file + pattern: "*.tbi" + ontologies: [] + gz_csi: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.gz": + type: file + description: bgzipped tab-delimited genome file + pattern: "*.gz" + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + - "*.csi": + type: file + description: csi index file + pattern: "*.csi" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@maxulysse" + - "@DLBPointon" +maintainers: + - "@maxulysse" + - "@DLBPointon" diff --git a/modules/nf-core/tabix/bgziptabix/tests/main.nf.test b/modules/nf-core/tabix/bgziptabix/tests/main.nf.test new file mode 100644 index 00000000..cdb016e5 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/tests/main.nf.test @@ -0,0 +1,123 @@ +nextflow_process { + + name "Test Process TABIX_BGZIPTABIX" + script "../main.nf" + process "TABIX_BGZIPTABIX" + + tag "modules" + tag "modules_nfcore" + tag "tabix" + tag "tabix/bgziptabix" + + test("sarscov2_bed_tbi") { + config "./tabix_tbi.config" + + when { + process { + """ + input[0] = [ + [ id:'tbi_test' ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.gz_tbi[0][1]).name + ).match("tbi_test") + } + ) + } + } + + test("sarscov2_bed_csi") { + config "./tabix_csi.config" + + when { + process { + """ + input[0] = [ + [ id:'csi_test' ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.gz_csi[0][1]).name + ).match("csi_test") + } + ) + } + + } + + test("sarscov2_bed_csi_stub") { + config "./tabix_csi.config" + + options 
"-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.gz_csi[0][1]).name + ).match("csi_stub") + } + ) + } + + } + + test("sarscov2_bed_tbi_stub") { + config "./tabix_tbi.config" + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.gz_tbi[0][1]).name + ).match("tbi_stub") + } + ) + } + + } + +} diff --git a/modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap b/modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap new file mode 100644 index 00000000..5f818045 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap @@ -0,0 +1,206 @@ +{ + "sarscov2_bed_tbi": { + "content": [ + { + "0": [ + [ + { + "id": "tbi_test" + }, + "tbi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", + "tbi_test.bed.gz.tbi:md5,ca06caf88b1e3c67d5fcba0a1460b52c" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ], + "gz_csi": [ + + ], + "gz_tbi": [ + [ + { + "id": "tbi_test" + }, + "tbi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", + "tbi_test.bed.gz.tbi:md5,ca06caf88b1e3c67d5fcba0a1460b52c" + ] + ], + "versions": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-26T13:52:30.53305451" + }, + "sarscov2_bed_csi": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "csi_test" + }, + "csi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", + 
"csi_test.bed.gz.csi:md5,c9c0377de58fdc89672bb3005a0d69f5" + ] + ], + "2": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ], + "gz_csi": [ + [ + { + "id": "csi_test" + }, + "csi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", + "csi_test.bed.gz.csi:md5,c9c0377de58fdc89672bb3005a0d69f5" + ] + ], + "gz_tbi": [ + + ], + "versions": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-26T13:52:34.152301569" + }, + "csi_test": { + "content": [ + "csi_test.bed.gz" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-02-19T14:51:00.548801" + }, + "sarscov2_bed_tbi_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ], + "gz_csi": [ + + ], + "gz_tbi": [ + [ + { + "id": "test" + }, + "test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-26T13:52:41.271812789" + }, + "csi_stub": { + "content": [ + "test.bed.gz" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-02-19T14:51:09.218454" + }, + "tbi_stub": { + "content": [ + "test.bed.gz" + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-25T14:45:18.550930179" + }, + "tbi_test": { + "content": [ + "tbi_test.bed.gz" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-02-19T14:50:51.579654" + }, + "sarscov2_bed_csi_stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + 
"test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.bed.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ], + "gz_csi": [ + [ + { + "id": "test" + }, + "test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.bed.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gz_tbi": [ + + ], + "versions": [ + "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-26T13:52:37.709221651" + } +} \ No newline at end of file diff --git a/modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config b/modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config new file mode 100644 index 00000000..fb41a314 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config @@ -0,0 +1,5 @@ +process { + withName: TABIX_BGZIPTABIX { + ext.args2 = '-p vcf --csi' + } +} diff --git a/modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config b/modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config new file mode 100644 index 00000000..c1915dc4 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config @@ -0,0 +1,5 @@ +process { + withName: TABIX_BGZIPTABIX { + ext.args2 = '-p vcf' + } +} \ No newline at end of file From b48b05ae0235a1dd6e412d67b521a80019d1716e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 15:58:41 +0100 Subject: [PATCH 03/58] Update and addition of new subworkflows --- subworkflows/local/accessory_files/main.nf | 4 +- subworkflows/local/telo_extraction/main.nf | 74 +++++++++++++++++++++ subworkflows/local/telo_finder/main.nf | 75 ++++++++++++++-------- 3 files changed, 126 insertions(+), 27 deletions(-) create mode 100644 subworkflows/local/telo_extraction/main.nf diff --git a/subworkflows/local/accessory_files/main.nf b/subworkflows/local/accessory_files/main.nf index 07121a29..83c0af65 100644 --- a/subworkflows/local/accessory_files/main.nf +++ 
b/subworkflows/local/accessory_files/main.nf @@ -80,7 +80,7 @@ workflow ACCESSORY_FILES { val_teloseq ) ch_versions = ch_versions.mix(TELO_FINDER.out.versions) - telo_file = TELO_FINDER.out.bedgraph_file.map{ it -> it[1] } + telo_file = TELO_FINDER.out.bedgraph_file } @@ -118,7 +118,7 @@ workflow ACCESSORY_FILES { emit: gap_file repeat_file - telo_file + telo_file // This is the possible collection of telomere files longread_output versions = ch_versions } diff --git a/subworkflows/local/telo_extraction/main.nf b/subworkflows/local/telo_extraction/main.nf new file mode 100644 index 00000000..73a955e6 --- /dev/null +++ b/subworkflows/local/telo_extraction/main.nf @@ -0,0 +1,74 @@ +include { GAWK as GAWK_CLEAN_TELOMERE } from '../../../modules/nf-core/gawk/main' +include { GAWK as GAWK_MAP_TELO } from '../../../modules/nf-core/gawk/main' +include { FIND_TELOMERE_WINDOWS } from '../../../modules/local/find/telomere_windows/main' +include { EXTRACT_TELO } from '../../../modules/local/extract/telo/main' +include { TABIX_BGZIPTABIX } from '../../../modules/nf-core/tabix/bgziptabix' + +workflow TELO_EXTRACTION { + take: + telomere_file //tuple(meta, file) + + main: + ch_versions = Channel.empty() + + // + // MODULE: CLEAN THE .TELOMERE FILE IF CONTAINS "you screwed up" ERROR MESSAGE + // (LIKELY WHEN USING LOWERCASE LETTERS OR BAD MOTIF) + // WORKS BE RETURNING LINES THAT START WITH '>' + // + GAWK_CLEAN_TELOMERE ( + telomere_file, + [], + false + ) + ch_versions = ch_versions.mix( GAWK_CLEAN_TELOMERE.out.versions ) + + + // + // MODULE: GENERATES A WINDOWS FILE FROM THE ABOVE + // + FIND_TELOMERE_WINDOWS ( + telomere_file + ) + ch_versions = ch_versions.mix( FIND_TELOMERE_WINDOWS.out.versions ) + + + def windows_file = FIND_TELOMERE_WINDOWS.out.windows + def fallback_file = GAWK_CLEAN_TELOMERE.out.output + + // Use EXTRACT_TELO if windows_file has content, otherwise fallback to GAWK_MAP_TELO + def safe_windows = windows_file.ifEmpty { Channel.empty() } + def 
fallback_valid = fallback_file.ifEmpty { Channel.empty() } + + EXTRACT_TELO( + safe_windows + ) + ch_versions = ch_versions.mix( EXTRACT_TELO.out.versions ) + + GAWK_MAP_TELO( + fallback_valid, + [], + false + ) + ch_gawk_output = GAWK_MAP_TELO.out.output.ifEmpty( Channel.empty() ) + ch_versions = ch_versions.mix( GAWK_MAP_TELO.out.versions ) + + // + // MODULE: Merge bed files into one for TABIX_BGZIPTABIX + // + // EXTRACT_TELO is the more important of the two, then we go to fallback, then just stop no point in running on empty file. + def merged_bed = EXTRACT_TELO.out.bed.ifEmpty { ch_gawk_output } + + + TABIX_BGZIPTABIX( + merged_bed + ) + ch_versions = ch_versions.mix( TABIX_BGZIPTABIX.out.versions ) + + emit: + bed_file = merged_bed + bed_gz_tbi = TABIX_BGZIPTABIX.out.gz_tbi + bedgraph_file = EXTRACT_TELO.out.bedgraph + versions = ch_versions + +} diff --git a/subworkflows/local/telo_finder/main.nf b/subworkflows/local/telo_finder/main.nf index d0d52123..7b9893a8 100644 --- a/subworkflows/local/telo_finder/main.nf +++ b/subworkflows/local/telo_finder/main.nf @@ -5,9 +5,9 @@ // include { GAWK as GAWK_UPPER_SEQUENCE } from '../../../modules/nf-core/gawk/main' include { FIND_TELOMERE_REGIONS } from '../../../modules/local/find/telomere_regions/main' -include { GAWK as GAWK_CLEAN_TELOMERE } from '../../../modules/nf-core/gawk/main' -include { FIND_TELOMERE_WINDOWS } from '../../../modules/local/find/telomere_windows/main' -include { EXTRACT_TELOMERE } from '../../../modules/local/extract/telomere/main' +include { GAWK as GAWK_SPLIT_DIRECTIONS } from '../../../modules/local/gawk/main' + +include { TELO_EXTRACTION } from '../../../subworkflows/local/telo_extraction/main' workflow TELO_FINDER { @@ -41,36 +41,61 @@ workflow TELO_FINDER { // - // MODULE: CLEAN THE .TELOMERE FILE IF CONTAINS "you screwed up" ERROR MESSAGE - // (LIKELY WHEN USING LOWERCASE LETTERS OR BAD MOTIF) - // WORKS BE RETURNING LINES THAT START WITH '>' + // MODULE: SPLIT THE TELOMERE FILE INTO 
5' and 3' FILES + // THIS IS RUNNING ON A LOCAL VERSION OF THE GAWK MODULE // - GAWK_CLEAN_TELOMERE ( - FIND_TELOMERE_REGIONS.out.telomere, - [], - false - ) - ch_versions = ch_versions.mix( GAWK_CLEAN_TELOMERE.out.versions ) + if (params.split_telomere) { + GAWK_SPLIT_DIRECTIONS ( + FIND_TELOMERE_REGIONS.out.telomere, + file("${projectDir}/bin/gawk_split_directions.awk"), + false + ) + ch_versions = ch_versions.mix( GAWK_SPLIT_DIRECTIONS.out.versions ) + GAWK_SPLIT_DIRECTIONS.out.prime5 + .map { meta, file -> + tuple( [id: meta.id + "_5P"], file) + } + .set { prime5_telo } + + GAWK_SPLIT_DIRECTIONS.out.prime3 + .map { meta, file -> + tuple( [id: meta.id + "_3P"], file) + } + .set { prime3_telo } + + prime5_telo + .mix(prime3_telo) + .mix(FIND_TELOMERE_REGIONS.out.telomere) + .set { telo_for_extraction } + + } else { + telo_for_extraction = FIND_TELOMERE_REGIONS.out.telomere + } - // - // MODULE: GENERATES A WINDOWS FILE FROM THE ABOVE - // - FIND_TELOMERE_WINDOWS ( - GAWK_CLEAN_TELOMERE.out.output - ) - ch_versions = ch_versions.mix( FIND_TELOMERE_WINDOWS.out.versions ) // - // MODULE: EXTRACTS THE LOCATION OF TELOMERIC SEQUENCE BASED ON THE WINDOWS + // SUBWORKFLOW: TELO_EXTRACTION + // - The prime5.mix(prime3) creates a queue channel to execute + // TELO_EXTRACTION per item in channel // - EXTRACT_TELOMERE ( - FIND_TELOMERE_WINDOWS.out.windows + TELO_EXTRACTION ( + telo_for_extraction ) - ch_versions = ch_versions.mix( EXTRACT_TELOMERE.out.versions ) + ch_versions = ch_versions.mix( TELO_EXTRACTION.out.versions ) + + + TELO_EXTRACTION.out.bedgraph_file + .map{ _meta, bedgraph -> + bedgraph + } + .collect() + .set { telo_bedgraphs } + emit: - bed_file = EXTRACT_TELOMERE.out.bed - bedgraph_file = EXTRACT_TELOMERE.out.bedgraph + bed_file = TELO_EXTRACTION.out.bed_file.collect() // Not used anymore + bed_gz_tbi = TELO_EXTRACTION.out.bed_gz_tbi.collect() // Not used anymore + bedgraph_file = telo_bedgraphs // Used in pretext_graph versions = ch_versions } From 
c33946f2eb2e3e457e2676579ea6a6db31ac7e34 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 15:59:11 +0100 Subject: [PATCH 04/58] Addition of split_telomere var --- workflows/curationpretext.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/curationpretext.nf b/workflows/curationpretext.nf index 874da2cb..324dc90c 100644 --- a/workflows/curationpretext.nf +++ b/workflows/curationpretext.nf @@ -115,6 +115,7 @@ workflow CURATIONPRETEXT { cove_file, telo_file, rept_file, + params.split_telomere ) ch_versions = ch_versions.mix( PRETEXT_INGEST_SNDRD.out.versions ) @@ -130,6 +131,7 @@ workflow CURATIONPRETEXT { cove_file, telo_file, rept_file, + params.split_telomere ) ch_versions = ch_versions.mix( PRETEXT_INGEST_SNDRD.out.versions ) } From 3976812375513a10c992c965e0db093f4872d77d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 16:00:58 +0100 Subject: [PATCH 05/58] Update to files --- conf/modules.config | 11 +++++++++++ modules.json | 10 ++++++++++ nextflow.config | 1 + nextflow_schema.json | 7 +++++++ 4 files changed, 29 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 65623cdb..ccdb5010 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -91,6 +91,17 @@ process { ext.suffix = 'telomere' } + withName: 'GAWK_MAP_TELO' { + ext.args2 = { "-v OFS=\"\t\" 'BEGIN { sub(/^>/, \"\"); print \$1, \$4, \$5, \$6 }'" } + ext.prefix = { "${meta.id}_map_telo" } + ext.suffix = 'bed' + } + + withName: 'GAWK_SPLIT_DIRECTIONS' { + ext.prefix = { "${input}_telo" } + ext.suffix = 'telomere' + } + // // NOTE: GNU_SORT module derivatives // diff --git a/modules.json b/modules.json index 30d74ced..cf41394e 100644 --- a/modules.json +++ b/modules.json @@ -35,6 +35,11 @@ "git_sha": "a29f18660f5e3748d44d6f716241e70c942c065d", "installed_by": ["modules"] }, + "cat/cat": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "gawk": { "branch": "master", 
"git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", @@ -92,6 +97,11 @@ "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["modules"] }, + "tabix/bgziptabix": { + "branch": "master", + "git_sha": "f2cfcf9d3f6a2d123e6c44aefa788aa232204a7a", + "installed_by": ["modules"] + }, "ucsc/bedgraphtobigwig": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", diff --git a/nextflow.config b/nextflow.config index f76a56fc..eb54c2ee 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,7 @@ params { // Input options input = null + split_telomere = false skip_tracks = "NONE" sample = "pretext_rerun" teloseq = "TTAGGG" diff --git a/nextflow_schema.json b/nextflow_schema.json index 307a510b..b3d71f02 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -20,6 +20,13 @@ "help_text": "You need the input fasta file", "fa_icon": "fas fa-file-fasta" }, + "split_telomere": { + "type": "boolean", + "format": "boolean", + "description": "Split the telomere file into 5' and 3' files for seperate ingestion into the HiC maps", + "default": false, + "fa_icon": "fas fa-check" + }, "skip_tracks": { "type": "string", "description": "Skip generation for specified tracks", From 6e69c94f7cf5b9b635eabb00f869b1143161d6e1 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:23:37 +0100 Subject: [PATCH 06/58] Update CHANGELOG --- CHANGELOG.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0ef4bc2..e2c8ccf9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,43 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.5.0](https://github.com/sanger-tol/curationpretext/releases/tag/1.5.0)] - UNSC Punic - [2025-08-04] + +### Added and Fixed + +- Addition of the `--split_telomere` boolean flag, this is false by default. 
+ - When `true` the pipeline will split the telomere file into a 5 and 3 prime file. +- Update `ACCESSORY_FILES` subworkflow: + - Remove `GET_LARGEST_SCAFFOLD` as we no longer need it, this was needed for TABIX so that the correct index file was used. This was used by the `TELO_FINDER` and `GAP_FINDER` subworkflows. +- Update `TELO_FINDER` subworkflow: + - Remove `GAWK_MAP_TELO` as it is no longer needed. + - Remove `GAWK_CLEAN_TELOMERE` as it is no longer needed. The reason for its inclusion has been fixed. + - Update `EXTRACT_TELO` to `EXTRACT_TELOMERE` which also removed the use of the `cat {file} | awk` pattern, replacing it with just `awk`. This was supposed to happen in `1.4.0`, but was forgotten with the files lying dormant in the repo. + - Refactor of the `TELO_FINDER` subworkflow, introducing the `TELO_EXTRACTION` subworkflow which is run per telo file. With the introduction of `split_telomere` this can be 3 files. +- Update `LONGREAD_COVERAGE` subworkflow: + - Remove `GRAPH_OVERALL_COVERAGE` as it is not in use. +- Better formatting in some files. +- Moved `GAWK_UPPER_SEQUENCE` from the `TELO_FINDER` subworkflow to the first step of the main `curationpretext` workflow, this simply makes more sense. + +### Paramters + +| Old Version | New Versions | +| ----------- | ---------------- | +| NA | --split_telomere | + +### Software Dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own Biocontainer. This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. 
+ +| Module | Old Version | New Versions | +| ------------------------ | ------------- | ------------- | +| `GRAPH_OVERALL_COVERAGE` | perl=5.26.2 | REMOVED | +| `EXTRACT_TELO` | coreutils=9.1 | REMOVED | +| `EXTRACT_TELOMERE` | NA | coreutils=9.1 | +| `GAWK_CLEAN_TELOMERE` | 5.3.0 | REMOVED | +| `GAWK_MAP_TELO` | 5.3.0 | REMOVED | +| `GET_LARGEST_SCAFF` | coreutils=9.1 | REMOVED | + ## [[1.4.2](https://github.com/sanger-tol/curationpretext/releases/tag/1.4.2)] - UNSC Nereid (H2) - [2025-07-28] ### Added and Fixed From fb289182ad88cd11b339915bd5ed4b05851e9149 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:23:51 +0100 Subject: [PATCH 07/58] Removed modules --- modules.json | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/modules.json b/modules.json index cf41394e..30d74ced 100644 --- a/modules.json +++ b/modules.json @@ -35,11 +35,6 @@ "git_sha": "a29f18660f5e3748d44d6f716241e70c942c065d", "installed_by": ["modules"] }, - "cat/cat": { - "branch": "master", - "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", - "installed_by": ["modules"] - }, "gawk": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", @@ -97,11 +92,6 @@ "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["modules"] }, - "tabix/bgziptabix": { - "branch": "master", - "git_sha": "f2cfcf9d3f6a2d123e6c44aefa788aa232204a7a", - "installed_by": ["modules"] - }, "ucsc/bedgraphtobigwig": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", From e7478473e9a25284745182002b32227e1f81fdf1 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:24:25 +0100 Subject: [PATCH 08/58] Added GAWK_UPPED_SEQ to main workflow --- workflows/curationpretext.nf | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/workflows/curationpretext.nf b/workflows/curationpretext.nf index 324dc90c..6e63803d 100644 --- a/workflows/curationpretext.nf +++ b/workflows/curationpretext.nf 
@@ -5,11 +5,14 @@ */ include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' -include { GENERATE_MAPS } from '../subworkflows/local/generate_maps/main' -include { ACCESSORY_FILES } from '../subworkflows/local/accessory_files/main' +include { GAWK as GAWK_UPPER_SEQUENCE } from '../modules/nf-core/gawk/main' + include { PRETEXT_GRAPH as PRETEXT_INGEST_SNDRD } from '../modules/local/pretext/graph/main' include { PRETEXT_GRAPH as PRETEXT_INGEST_HIRES } from '../modules/local/pretext/graph/main' +include { GENERATE_MAPS } from '../subworkflows/local/generate_maps/main' +include { ACCESSORY_FILES } from '../subworkflows/local/accessory_files/main' + include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -32,11 +35,23 @@ workflow CURATIONPRETEXT { ch_empty_file = Channel.fromPath("${baseDir}/assets/EMPTY.txt") + // + // MODULE: UPPERCASE THE REFERENCE SEQUENCE + // + GAWK_UPPER_SEQUENCE( + ch_reference, + [], + false, + ) + ch_upper_ref = GAWK_UPPER_SEQUENCE.out.output + ch_versions = ch_versions.mix( GAWK_UPPER_SEQUENCE.out.versions ) + + // // MODULE: GENERATE INDEX OF REFERENCE FASTA // SAMTOOLS_FAIDX ( - ch_reference, + ch_upper_ref, [[],[]], false ) @@ -76,7 +91,7 @@ workflow CURATIONPRETEXT { // SUBWORKFLOW: GENERATE SUPPLEMENTARY FILES FOR PRETEXT INGESTION // ACCESSORY_FILES ( - ch_reference, + ch_upper_ref, ch_reads, val_teloseq, SAMTOOLS_FAIDX.out.fai @@ -96,7 +111,7 @@ workflow CURATIONPRETEXT { // - GENERATE_MAPS IS THE MINIMAL OUTPUT EXPECTED FROM THIS PIPELLINE // GENERATE_MAPS ( - ch_reference, + ch_upper_ref, ch_cram_reads, SAMTOOLS_FAIDX.out.fai ) From 1ef9c58133491daff343c606ca968fc87d0dbd89 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:26:13 +0100 Subject: [PATCH 09/58] Update subworkflows to remove modules, reorganise files and add split_telo 
support --- subworkflows/local/accessory_files/main.nf | 16 +----- subworkflows/local/gap_finder/main.nf | 1 - subworkflows/local/longread_coverage/main.nf | 14 ++---- subworkflows/local/repeat_density/main.nf | 15 ++++++ subworkflows/local/telo_extraction/main.nf | 53 +++----------------- subworkflows/local/telo_finder/main.nf | 16 +----- 6 files changed, 28 insertions(+), 87 deletions(-) diff --git a/subworkflows/local/accessory_files/main.nf b/subworkflows/local/accessory_files/main.nf index 83c0af65..1f04bb04 100644 --- a/subworkflows/local/accessory_files/main.nf +++ b/subworkflows/local/accessory_files/main.nf @@ -9,7 +9,6 @@ include { REPEAT_DENSITY } from '../repeat_density/main' include { LONGREAD_COVERAGE } from '../longread_coverage/main' include { GAWK as GAWK_GENERATE_GENOME_FILE } from '../../../modules/nf-core/gawk/main' -include { GET_LARGEST_SCAFFOLD } from '../../../modules/local/get/largest_scaffold/main' workflow ACCESSORY_FILES { take: @@ -42,17 +41,6 @@ workflow ACCESSORY_FILES { ch_versions = ch_versions.mix( GAWK_GENERATE_GENOME_FILE.out.versions ) - // - // MODULE: Cut out the largest scaffold size and use as comparator against 512MB - // This is the cut off for TABIX using tbi indexes - // TODO: Investigate this as a pure groovy function. 
- // - GET_LARGEST_SCAFFOLD ( - GAWK_GENERATE_GENOME_FILE.out.output - ) - ch_versions = ch_versions.mix( GET_LARGEST_SCAFFOLD.out.versions ) - - // // SUBWORKFLOW: GENERATES A GAP.BED FILE TO ID THE LOCATIONS OF GAPS // @@ -60,8 +48,7 @@ workflow ACCESSORY_FILES { gap_file = ch_empty_file } else { GAP_FINDER ( - reference_tuple, - GET_LARGEST_SCAFFOLD.out.scaff_size.map{it -> it[1].toInteger()} + reference_tuple ) ch_versions = ch_versions.mix(GAP_FINDER.out.versions) gap_file = GAP_FINDER.out.gap_file.map{ it -> it[1] } @@ -75,7 +62,6 @@ workflow ACCESSORY_FILES { telo_file = ch_empty_file } else { TELO_FINDER ( - GET_LARGEST_SCAFFOLD.out.scaff_size.map{it -> it[1].toInteger()}, reference_tuple, val_teloseq ) diff --git a/subworkflows/local/gap_finder/main.nf b/subworkflows/local/gap_finder/main.nf index 406f7173..10ca907c 100644 --- a/subworkflows/local/gap_finder/main.nf +++ b/subworkflows/local/gap_finder/main.nf @@ -9,7 +9,6 @@ include { GAWK as GAWK_GAP_LENGTH } from '../../../modules/nf-core/gawk/main' workflow GAP_FINDER { take: reference_tuple // Channel [ val(meta), path(fasta) ] - max_scaff_size // val(size of largest scaffold in bp) main: ch_versions = Channel.empty() diff --git a/subworkflows/local/longread_coverage/main.nf b/subworkflows/local/longread_coverage/main.nf index e2e988e3..9fd1f927 100644 --- a/subworkflows/local/longread_coverage/main.nf +++ b/subworkflows/local/longread_coverage/main.nf @@ -11,7 +11,6 @@ include { SAMTOOLS_MERGE } from '../../../modules include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER_PRIMARY } from '../../../modules/nf-core/samtools/view/main' include { UCSC_BEDGRAPHTOBIGWIG } from '../../../modules/nf-core/ucsc/bedgraphtobigwig/main' -include { GRAPH_OVERALL_COVERAGE } from '../../../modules/local/graph/overall_coverage/main' workflow LONGREAD_COVERAGE { @@ -97,7 +96,9 @@ workflow LONGREAD_COVERAGE { // // MODULE: BAM TO PRIMARY BED // - 
BEDTOOLS_BAMTOBED(SAMTOOLS_VIEW_FILTER_PRIMARY.out.bam) + BEDTOOLS_BAMTOBED( + SAMTOOLS_VIEW_FILTER_PRIMARY.out.bam + ) ch_versions = ch_versions.mix(BEDTOOLS_BAMTOBED.out.versions) @@ -140,15 +141,6 @@ workflow LONGREAD_COVERAGE { ch_versions = ch_versions.mix( GNU_SORT.out.versions ) - // - // MODULE: GENERATE DEPTHGRAPH - // - GRAPH_OVERALL_COVERAGE( - GNU_SORT.out.sorted - ) - ch_versions = ch_versions.mix( GRAPH_OVERALL_COVERAGE.out.versions ) - - // // LOGIC: PREPARING NORMAL COVERAGE INPUT // diff --git a/subworkflows/local/repeat_density/main.nf b/subworkflows/local/repeat_density/main.nf index b691a5d0..ce4400d2 100644 --- a/subworkflows/local/repeat_density/main.nf +++ b/subworkflows/local/repeat_density/main.nf @@ -25,12 +25,15 @@ workflow REPEAT_DENSITY { main: ch_versions = Channel.empty() + + // // MODULE: MARK UP THE REPEAT REGIONS OF THE REFERENCE GENOME // WINDOWMASKER_MKCOUNTS ( reference_tuple ) ch_versions = ch_versions.mix( WINDOWMASKER_MKCOUNTS.out.versions ) + // // MODULE: CALCULATE THE STATISTICS OF THE MARKED UP REGIONS // @@ -38,18 +41,21 @@ workflow REPEAT_DENSITY { reference_tuple ) ch_versions = ch_versions.mix( WINDOWMASKER_USTAT.out.versions ) + // // MODULE: USE USTAT OUTPUT TO EXTRACT REPEATS FROM FASTA // EXTRACT_REPEAT( WINDOWMASKER_USTAT.out.intervals ) ch_versions = ch_versions.mix( EXTRACT_REPEAT.out.versions ) + // // MODULE: CREATE WINDOWS FROM .GENOME FILE // BEDTOOLS_MAKEWINDOWS( dot_genome ) ch_versions = ch_versions.mix( BEDTOOLS_MAKEWINDOWS.out.versions ) + // // LOGIC: COMBINE TWO CHANNELS AND OUTPUT tuple(meta, windows_file, repeat_file) // @@ -63,6 +69,7 @@ workflow REPEAT_DENSITY { } .set { intervals } + // // MODULE: GENERATES THE REPEAT FILE FROM THE WINDOW FILE AND GENOME FILE // @@ -72,6 +79,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( BEDTOOLS_INTERSECT.out.versions ) + // // MODULE: FIXES IDS FOR REPEATS // @@ -82,6 +90,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( 
GAWK_RENAME_IDS.out.versions ) + // // MODULE: SORTS THE ABOVE BED FILES // @@ -94,6 +103,7 @@ workflow REPEAT_DENSITY { GNU_SORT_C ( BEDTOOLS_MAKEWINDOWS.out.bed ) // windows file ch_versions = ch_versions.mix( GNU_SORT_C.out.versions ) + // // MODULE: ADDS 4TH COLUMN TO BED FILE USED IN THE REPEAT DENSITY GRAPH // @@ -104,6 +114,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( GAWK_REFORMAT_INTERSECT.out.versions ) + // // LOGIC: COMBINES THE REFORMATTED INTERSECT FILE AND WINDOWS FILE CHANNELS AND SORTS INTO // tuple(intersect_meta, windows file, intersect file) @@ -118,6 +129,7 @@ workflow REPEAT_DENSITY { } .set { for_mapping } + // // MODULE: MAPS THE REPEATS AGAINST THE REFERENCE GENOME // @@ -127,6 +139,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( BEDTOOLS_MAP.out.versions ) + // // MODULE: REPLACES . WITH 0 IN MAPPED FILE // @@ -137,6 +150,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( GAWK_REPLACE_DOTS.out.versions ) + // // MODULE: CONVERTS GENOME FILE AND BED INTO A BIGWIG FILE // @@ -146,6 +160,7 @@ workflow REPEAT_DENSITY { ) ch_versions = ch_versions.mix( UCSC_BEDGRAPHTOBIGWIG.out.versions ) + emit: repeat_density = UCSC_BEDGRAPHTOBIGWIG.out.bigwig versions = ch_versions diff --git a/subworkflows/local/telo_extraction/main.nf b/subworkflows/local/telo_extraction/main.nf index 73a955e6..e5bfd667 100644 --- a/subworkflows/local/telo_extraction/main.nf +++ b/subworkflows/local/telo_extraction/main.nf @@ -1,8 +1,5 @@ -include { GAWK as GAWK_CLEAN_TELOMERE } from '../../../modules/nf-core/gawk/main' -include { GAWK as GAWK_MAP_TELO } from '../../../modules/nf-core/gawk/main' include { FIND_TELOMERE_WINDOWS } from '../../../modules/local/find/telomere_windows/main' -include { EXTRACT_TELO } from '../../../modules/local/extract/telo/main' -include { TABIX_BGZIPTABIX } from '../../../modules/nf-core/tabix/bgziptabix' +include { EXTRACT_TELOMERE } from '../../../modules/local/extract/telomere/main' workflow 
TELO_EXTRACTION { take: @@ -11,19 +8,6 @@ workflow TELO_EXTRACTION { main: ch_versions = Channel.empty() - // - // MODULE: CLEAN THE .TELOMERE FILE IF CONTAINS "you screwed up" ERROR MESSAGE - // (LIKELY WHEN USING LOWERCASE LETTERS OR BAD MOTIF) - // WORKS BE RETURNING LINES THAT START WITH '>' - // - GAWK_CLEAN_TELOMERE ( - telomere_file, - [], - false - ) - ch_versions = ch_versions.mix( GAWK_CLEAN_TELOMERE.out.versions ) - - // // MODULE: GENERATES A WINDOWS FILE FROM THE ABOVE // @@ -34,41 +18,20 @@ workflow TELO_EXTRACTION { def windows_file = FIND_TELOMERE_WINDOWS.out.windows - def fallback_file = GAWK_CLEAN_TELOMERE.out.output - - // Use EXTRACT_TELO if windows_file has content, otherwise fallback to GAWK_MAP_TELO def safe_windows = windows_file.ifEmpty { Channel.empty() } - def fallback_valid = fallback_file.ifEmpty { Channel.empty() } - - EXTRACT_TELO( - safe_windows - ) - ch_versions = ch_versions.mix( EXTRACT_TELO.out.versions ) - - GAWK_MAP_TELO( - fallback_valid, - [], - false - ) - ch_gawk_output = GAWK_MAP_TELO.out.output.ifEmpty( Channel.empty() ) - ch_versions = ch_versions.mix( GAWK_MAP_TELO.out.versions ) // - // MODULE: Merge bed files into one for TABIX_BGZIPTABIX + // MODULE: Extract the telomere data from the FIND_TELOMERE + // file and reformat into bed // - // EXTRACT_TELO is the more important of the two, then we go to fallback, then just stop no point in running on empty file. 
- def merged_bed = EXTRACT_TELO.out.bed.ifEmpty { ch_gawk_output } - - - TABIX_BGZIPTABIX( - merged_bed + EXTRACT_TELOMERE( + safe_windows ) - ch_versions = ch_versions.mix( TABIX_BGZIPTABIX.out.versions ) + ch_versions = ch_versions.mix( EXTRACT_TELOMERE.out.versions ) + emit: - bed_file = merged_bed - bed_gz_tbi = TABIX_BGZIPTABIX.out.gz_tbi - bedgraph_file = EXTRACT_TELO.out.bedgraph + bedgraph_file = EXTRACT_TELOMERE.out.bedgraph versions = ch_versions } diff --git a/subworkflows/local/telo_finder/main.nf b/subworkflows/local/telo_finder/main.nf index 7b9893a8..ce827a3d 100644 --- a/subworkflows/local/telo_finder/main.nf +++ b/subworkflows/local/telo_finder/main.nf @@ -3,7 +3,6 @@ // // MODULE IMPORT BLOCK // -include { GAWK as GAWK_UPPER_SEQUENCE } from '../../../modules/nf-core/gawk/main' include { FIND_TELOMERE_REGIONS } from '../../../modules/local/find/telomere_regions/main' include { GAWK as GAWK_SPLIT_DIRECTIONS } from '../../../modules/local/gawk/main' @@ -12,7 +11,6 @@ include { TELO_EXTRACTION } from '../../../subworkflows/local/telo workflow TELO_FINDER { take: - max_scaff_size // val(size of largest scaffold in bp) reference_tuple // Channel [ val(meta), path(fasta) ] teloseq @@ -20,21 +18,11 @@ workflow TELO_FINDER { ch_versions = Channel.empty() - // - // MODULE: UPPERCASE THE REFERENCE SEQUENCE - // - GAWK_UPPER_SEQUENCE( - reference_tuple, - [], - false, - ) - ch_versions = ch_versions.mix( GAWK_UPPER_SEQUENCE.out.versions ) - // // MODULE: FINDS THE TELOMERIC SEQEUNCE IN REFERENCE // FIND_TELOMERE_REGIONS ( - GAWK_UPPER_SEQUENCE.out.output, + reference_tuple, teloseq ) ch_versions = ch_versions.mix( FIND_TELOMERE_REGIONS.out.versions ) @@ -94,8 +82,6 @@ workflow TELO_FINDER { emit: - bed_file = TELO_EXTRACTION.out.bed_file.collect() // Not used anymore - bed_gz_tbi = TELO_EXTRACTION.out.bed_gz_tbi.collect() // Not used anymore bedgraph_file = telo_bedgraphs // Used in pretext_graph versions = ch_versions } From 
a298944dc9ba1da667371c6a8649f5f1963ba195 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:26:30 +0100 Subject: [PATCH 10/58] Update files --- conf/modules.config | 12 ------------ conf/test.config | 19 ++++++++++--------- conf/test_full.config | 20 +++++++++++--------- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ccdb5010..02b8162f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -85,18 +85,6 @@ process { ext.suffix = 'fasta' } - withName: 'GAWK_CLEAN_TELOMERE' { - ext.args2 = "'/^>/'" - ext.prefix = { "${meta.id}_CLEAN" } - ext.suffix = 'telomere' - } - - withName: 'GAWK_MAP_TELO' { - ext.args2 = { "-v OFS=\"\t\" 'BEGIN { sub(/^>/, \"\"); print \$1, \$4, \$5, \$6 }'" } - ext.prefix = { "${meta.id}_map_telo" } - ext.suffix = 'bed' - } - withName: 'GAWK_SPLIT_DIRECTIONS' { ext.prefix = { "${input}_telo" } ext.suffix = 'telomere' diff --git a/conf/test.config b/conf/test.config index 80d23e85..f98582dd 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,13 +22,14 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - input = "${baseDir}/TreeValTinyData/assembly/draft/grTriPseu1.fa" - reads = "${baseDir}/TreeValTinyData/genomic_data/pacbio/" - cram = "${baseDir}/TreeValTinyData/genomic_data/hic-arima/" - sample = "CurationPretextTest" - teloseq = "TTAGGG" - aligner = "bwamem2" - all_output = false - skip_tracks = "NONE" - run_hires = false + input = "${baseDir}/TreeValTinyData/assembly/draft/grTriPseu1.fa" + reads = "${baseDir}/TreeValTinyData/genomic_data/pacbio/" + cram = "${baseDir}/TreeValTinyData/genomic_data/hic-arima/" + sample = "CurationPretextTest" + teloseq = "TTAGGG" + aligner = "bwamem2" + all_output = false + skip_tracks = "NONE" + run_hires = false + split_telomere = true } diff --git a/conf/test_full.config b/conf/test_full.config index e164c0aa..3166bfd1 100644 --- 
a/conf/test_full.config +++ b/conf/test_full.config @@ -20,13 +20,15 @@ params { // Input data for full size test // Limit resources so that this can run on GitHub Actions - sample = "testing" - input = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa" - reads = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/pacbio/" - cram = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/" - sample = "CurationPretextTest" - teloseq = "TTAGGG" - aligner = "bwamem2" - all_output = true - skip_tracks = "NONE" + sample = "testing" + input = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/assembly/draft/grTriPseu1.fa" + reads = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/pacbio/" + cram = "/nfs/treeoflife-01/resources/nextflow/test-data/resources/treeval/TreeValTinyData/genomic_data/hic-arima/" + sample = "CurationPretextTest" + teloseq = "TTAGGG" + aligner = "bwamem2" + all_output = true + skip_tracks = "NONE" + split_telomere = true + } From 6d6f75724ed4916b0ba8974a9288a14bda9a84a3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:32:05 +0100 Subject: [PATCH 11/58] support split_telomere --- bin/gawk_split_directions.awk | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 bin/gawk_split_directions.awk diff --git a/bin/gawk_split_directions.awk b/bin/gawk_split_directions.awk new file mode 100644 index 00000000..df82aa10 --- /dev/null +++ b/bin/gawk_split_directions.awk @@ -0,0 +1,8 @@ +## Split telomere file based on column 4 contents +## Date: 03/07/2025 + +BEGIN { + FS="\t"; OFS="\t" +} { + print > "direction."$3".telomere" +} From 88bea91760f9ea7500a72f6f9fe467d03797c8b5 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:34:23 +0100 Subject: [PATCH 12/58] Additions --- CHANGELOG.md | 
2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e2c8ccf9..0e297302 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Remove `GRAPH_OVERALL_COVERAGE` as it is not in use. - Better formatting in some files. - Moved `GAWK_UPPER_SEQUENCE` from the `TELO_FINDER` subworkflow to the first step of the main `curationpretext` workflow, this simply makes more sense. +- Removed no longer needed scripts from bin. +- Added the `gawk_split_directions.awk` script for split telomere. ### Paramters From 39dc704a7132942319d4e38229d49e68bc79f267 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 17:40:26 +0100 Subject: [PATCH 13/58] Update Version --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index eb54c2ee..9b225f43 100644 --- a/nextflow.config +++ b/nextflow.config @@ -260,7 +260,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=24.04.2' - version = '1.4.2' + version = '1.5.0' doi = '10.5281/zenodo.12773958' } From 426325392294a4426f232e1bedc7ff1fd8df3535 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 19:58:39 +0100 Subject: [PATCH 14/58] Updated test --- tests/main.nf.test.snap | 43 ++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap index 7815abe3..32259ee9 100644 --- a/tests/main.nf.test.snap +++ b/tests/main.nf.test.snap @@ -1,7 +1,7 @@ { "Full run": { "content": [ - 40, + 42, { "BEDTOOLS_BAMTOBED": { "bedtools": "2.31.1" @@ -43,9 +43,6 @@ "FIND_TELOMERE_WINDOWS": { "telomere": 1.0 }, - "GAWK_CLEAN_TELOMERE": { - "gawk": "5.3.0" - }, "GAWK_GAP_LENGTH": { "gawk": "5.3.0" }, @@ -61,12 +58,11 @@ "GAWK_REPLACE_DOTS": { "gawk": "5.3.0" }, - "GAWK_UPPER_SEQUENCE": { + "GAWK_SPLIT_DIRECTIONS": { "gawk": "5.3.0" }, - 
"GET_LARGEST_SCAFFOLD": { - "get_largest_scaffold": 2.0, - "coreutils": 9.1 + "GAWK_UPPER_SEQUENCE": { + "gawk": "5.3.0" }, "GNU_SORT": { "coreutils": 9.3 @@ -80,10 +76,6 @@ "GNU_SORT_C": { "coreutils": 9.3 }, - "GRAPH_OVERALL_COVERAGE": { - "perl": "(v5.26.2))", - "graph_overall_coverage.pl": 1.0 - }, "MINIMAP2_ALIGN": { "minimap2": "2.28-r1209", "samtools": 1.2 @@ -125,13 +117,17 @@ "windowmasker": "1.0.0" }, "Workflow": { - "sanger-tol/curationpretext": "v1.4.2" + "sanger-tol/curationpretext": "v1.5.0" } }, [ "accessory_files", "accessory_files/CurationPretextTest.bigWig", "accessory_files/CurationPretextTest.gap.bedgraph", + "accessory_files/CurationPretextTest_3P_telomere.bed", + "accessory_files/CurationPretextTest_3P_telomere.bedgraph", + "accessory_files/CurationPretextTest_5P_telomere.bed", + "accessory_files/CurationPretextTest_5P_telomere.bedgraph", "accessory_files/CurationPretextTest_telomere.bed", "accessory_files/CurationPretextTest_telomere.bedgraph", "accessory_files/coverage.bigWig", @@ -139,24 +135,31 @@ "pipeline_info/sanger-tol_curationpretext_software_versions.yml", "pretext_maps_processed", "pretext_maps_processed/CurationPretextTest_normal.pretext", + "pretext_maps_processed/telo_0.pretext", + "pretext_maps_processed/telo_1.pretext", + "pretext_maps_processed/telo_2.pretext", "pretext_maps_raw", "pretext_maps_raw/CurationPretextTest_normal_pi.pretext", "pretext_snapshot", "pretext_snapshot/CurationPretextTest_normalFullMap.png" ], - 14, + 21, [ "CurationPretextTest.bigWig:md5,3f66a9152d793a62f877b733c2336dfd", "CurationPretextTest.gap.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_3P_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_3P_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_5P_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_5P_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", 
"CurationPretextTest_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", "CurationPretextTest_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", - "coverage.bigWig:md5,2e474506c957152b231ac63c859f0b17" + "coverage.bigWig:md5,39b3e8b7751b33758087cafc9a3c689e" ], - 5, + 9, 1, false, true, - 1, + 4, false, true, 1, @@ -164,8 +167,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.04.4" + "nextflow": "25.04.1" }, - "timestamp": "2025-04-16T11:23:34.556355" + "timestamp": "2025-08-04T17:47:27.212054464" } -} +} \ No newline at end of file From 6f1f44955077acbeae28c7178ef7671ecfe7e245 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 4 Aug 2025 22:38:21 +0100 Subject: [PATCH 15/58] Update tests --- .nf-core.yml | 2 +- CITATION.cff | 4 +- bin/findHalfcoverage.py | 177 --------------- bin/get_avgcov.sh | 17 -- bin/graph_overall_coverage.pl | 34 --- bin/longread_cov_log.py | 43 ---- modules/local/get/largest_scaffold/main.nf | 43 ---- modules/local/graph/overall_coverage/main.nf | 43 ---- modules/nf-core/cat/cat/environment.yml | 7 - modules/nf-core/cat/cat/main.nf | 78 ------- modules/nf-core/cat/cat/meta.yml | 46 ---- modules/nf-core/cat/cat/tests/main.nf.test | 191 ---------------- .../nf-core/cat/cat/tests/main.nf.test.snap | 147 ------------- .../cat/tests/nextflow_unzipped_zipped.config | 6 - .../cat/tests/nextflow_zipped_unzipped.config | 8 - .../nf-core/tabix/bgziptabix/environment.yml | 8 - modules/nf-core/tabix/bgziptabix/main.nf | 48 ---- modules/nf-core/tabix/bgziptabix/meta.yml | 74 ------- .../tabix/bgziptabix/tests/main.nf.test | 123 ----------- .../tabix/bgziptabix/tests/main.nf.test.snap | 206 ------------------ .../tabix/bgziptabix/tests/tabix_csi.config | 5 - .../tabix/bgziptabix/tests/tabix_tbi.config | 5 - tests/main.nf.test | 1 + 23 files changed, 4 insertions(+), 1312 deletions(-) delete mode 100755 bin/findHalfcoverage.py delete mode 100755 bin/get_avgcov.sh delete mode 100755 bin/graph_overall_coverage.pl delete mode 100755 
bin/longread_cov_log.py delete mode 100644 modules/local/get/largest_scaffold/main.nf delete mode 100644 modules/local/graph/overall_coverage/main.nf delete mode 100644 modules/nf-core/cat/cat/environment.yml delete mode 100644 modules/nf-core/cat/cat/main.nf delete mode 100644 modules/nf-core/cat/cat/meta.yml delete mode 100644 modules/nf-core/cat/cat/tests/main.nf.test delete mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap delete mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config delete mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config delete mode 100644 modules/nf-core/tabix/bgziptabix/environment.yml delete mode 100644 modules/nf-core/tabix/bgziptabix/main.nf delete mode 100644 modules/nf-core/tabix/bgziptabix/meta.yml delete mode 100644 modules/nf-core/tabix/bgziptabix/tests/main.nf.test delete mode 100644 modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap delete mode 100644 modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config delete mode 100644 modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config diff --git a/.nf-core.yml b/.nf-core.yml index e3d2362b..2f446907 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -48,4 +48,4 @@ template: - seqera_platform - multiqc - rocrate - version: 1.4.2 + version: 1.5.0 diff --git a/CITATION.cff b/CITATION.cff index 9d72b971..0abe02de 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -30,6 +30,6 @@ identifiers: value: 10.5281/zenodo.12773958 repository-code: "https://github.com/sanger-tol/curationpretext" license: MIT -version: 1.4.2 -date-released: "2025-07-28" +version: 1.5.0 +date-released: "2025-08-04" url: "https://pipelines.tol.sanger.ac.uk/curationpretext" diff --git a/bin/findHalfcoverage.py b/bin/findHalfcoverage.py deleted file mode 100755 index f83fdcc5..00000000 --- a/bin/findHalfcoverage.py +++ /dev/null @@ -1,177 +0,0 @@ -#! 
/usr/bin/env python3 - -import re -import sys -from optparse import OptionParser - - -def load_scafsize(file): - # example is my.genome file, "scaffold\tsize" - - scafkey = {} - scaffile = open(file, "r") - for line in scaffile: - line = line.replace("\n", "") - name, size = re.split("\t", line) - scafkey[name] = size - - scaffile.close() - return scafkey - - -def getTotallength_undercov(file, cov, wiggleroom): - # example is bed file of coverage, - # scaffold_100_arrow 0 2 18 - - coverage_cutoff = cov + wiggleroom - - myfile = open(file, "r") - - lowcoverage_sum = 0 - prev_scaf = "" - scaf_lc = {} - - for line in myfile: - line = line.replace("\n", "") - objContents = re.split("\t", line) - - if prev_scaf != objContents[0]: - scaf_lc[prev_scaf] = lowcoverage_sum - lowcoverage_sum = 0 - - if float(objContents[3]) < coverage_cutoff: - length = float(objContents[2]) - float(objContents[1]) - lowcoverage_sum += length - - prev_scaf = objContents[0] - - scaf_lc[prev_scaf] = lowcoverage_sum - myfile.close() - - return scaf_lc - - -def get_cov_peaks(file): - # example is depthgraph.txt, "coverage\tbasepair count" - - myPeakFile = open(file, "r") - - rows = [] - for line in myPeakFile: - line = line.replace("\n", "") - items = re.split("\t", line) - rows.append(items) - - myPeakFile.close() - # print(rows[0]) - peakCov = sorted(rows, key=lambda cov: int(cov[1]), reverse=1)[0][0] - - if int(peakCov) == 0: - peakCov = sorted(rows, key=lambda cov: int(cov[1]), reverse=1)[1][0] - - halfPeak = int(peakCov) / 2 - qrtPeak = int(peakCov) / 4 - - print("#Coverage Peak is %s, HalfPeak is %s, QuarterPeak is %s " % (peakCov, halfPeak, qrtPeak)) - - return (peakCov, halfPeak, qrtPeak) - - -def calc_coverage(scafsize, totallowcov): - # calculate the % for lowcov coverage over entire scaffold. 
- return totallowcov / scafsize * 100 - - -def getArguments(): - # get indivudual arguments from user - - parser = OptionParser(version="%prog 1.0") - parser.add_option( - "-c", "--coveragefile", action="store", type="string", dest="covfile", help="Scaffold Coverage filename" - ) - parser.add_option( - "-m", "--mygenome", action="store", type="string", dest="mygenome", help="mygenome file, scaffold - size file" - ) - parser.add_option( - "-d", - "--depthgraph", - action="store", - type="string", - dest="depth", - help="depthgraph file, bp count at each depth", - ) - parser.add_option( - "-w", - "--wiggle", - action="store", - type="float", - dest="wig", - default=5, - help="wiggle room to add to depth cutoff ie 30X + wiggleroom. Default is 5X", - ) - parser.add_option( - "--cut", - action="store", - type="float", - dest="covcut", - default=60, - help="%Number for coverage cutoff to include in results. ie 50% of scaffold needs to be under diploid peak etc. Default is 60%", - ) - parser.add_option( - "-t", - "--totalsize", - action="store", - type="int", - dest="totsize", - default=250000, - help="total size that determines max coverage boundary.", - ) - - (options, args) = parser.parse_args() - - if options.covfile == None or options.mygenome == None or options.depth == None: - print("Missing Options") - exit() - - return options - - -def main(): - # main program - - options = getArguments() - - scaffold_sizes = load_scafsize(options.mygenome) - (hapCov, dipCov, tetCov) = get_cov_peaks(options.depth) - scaffold_lowcovsum = getTotallength_undercov(options.covfile, dipCov, options.wig) - - for scaffoldName in scaffold_lowcovsum: - if scaffoldName == "": - continue - - # print("==" + scaffoldName) - totalSize = float(scaffold_sizes[scaffoldName]) - lowcovSize = float(scaffold_lowcovsum[scaffoldName]) - - coverage = calc_coverage(totalSize, lowcovSize) - - if coverage > options.covcut: - if totalSize > options.totsize: - print( - "**\t" - + "\t".join( - [str(i) for i in 
[scaffoldName, int(totalSize), int(lowcovSize), "{:.1f}".format(coverage)]] - ) - ) - else: - print( - "==\t" - + "\t".join( - [str(i) for i in [scaffoldName, int(totalSize), int(lowcovSize), "{:.1f}".format(coverage)]] - ) - ) - - -# -- script execuation -- # -if __name__ == "__main__": - main() diff --git a/bin/get_avgcov.sh b/bin/get_avgcov.sh deleted file mode 100755 index 2eac5ca5..00000000 --- a/bin/get_avgcov.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -# get_avgcov.sh -# ------------------- -# A shell script to calculate average coverage for each scaffold -# into bed format for use -# ------------------- -# Author = yy5 -# Modified = dp24 -# ------------------- -version='1.0.0' -if [ $1 == '-v' ]; -then - echo "$version" -else - awk '{OFS="\t"; $5=$4*($3-$2); print}' $1|awk '{OFS="\t"; sum[$1]+=$5} END {for (chrom in sum) print chrom, sum[chrom]}'|awk 'BEGIN {FS="\t"; OFS="\t"} NR==FNR {genome[$1]=$2; next} {if ($1 in genome) print $1, genome[$1], $2, $3; else print $1, "NA", $2, $3}' - $2| awk '{OFS="\t"; print $1,"0",$3,($2/$3)}' | awk 'BEGIN {FS="\t"; OFS="\t"} {printf "%s\t%s\t%s\t%.0f\n", $1, $2, $3, int($4 + 0.5)}'|sort -T $4 -k1,1 -k2,2n> $3 -fi diff --git a/bin/graph_overall_coverage.pl b/bin/graph_overall_coverage.pl deleted file mode 100755 index 174e61b7..00000000 --- a/bin/graph_overall_coverage.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env perl - -# Script originally developed by Yumi Sims (yy5@sanger.ac.uk) - -use warnings; - -# my $file = shift; - -my ($file) = @ARGV; - -if (!@ARGV || ($ARGV[0] eq '--version')) { - print "1.0\n"; - exit 0; -} - -open (FILE, $file) || die "can't open file $file\n"; - -my %depthcount; -while (my $line = ) { - chomp $line; - my ($id, $start, $end, $depth) = split ("\t", $line); - my $length = $end - $start; - - if ($depthcount{$depth}){ - $depthcount{$depth} += $length; - } - else { - $depthcount{$depth} = $length; - } -} - -foreach my $depth (sort {$a<=>$b} keys %depthcount){ - print join("\t", $depth, 
$depthcount{$depth}) ."\n"; -} diff --git a/bin/longread_cov_log.py b/bin/longread_cov_log.py deleted file mode 100755 index d5cc177c..00000000 --- a/bin/longread_cov_log.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -import optparse -import math - -# Script originally developed by Will Eagles (we3@sanger.ac.uk) - - -def process_line(line): - line_values = line.rsplit(None, 1) - - try: - cov_val = float(line_values[1]) - except: - cov_val = 0 - - if cov_val > 0: - log_cov_val = math.log(cov_val) - else: - log_cov_val = 0 - - return line_values[0] + "\t" + str(round(log_cov_val, 2)) - - -def main(): - parser = optparse.OptionParser(version="%prog 1.0") - parser.add_option( - "-i", - "--inputfile", - dest="inputfile", - default="default.input", - ) - - options, remainder = parser.parse_args() - - cov_bed = open(options.inputfile, "r") - - for line in cov_bed: - print(process_line(line)) - - -if __name__ == "__main__": - main() diff --git a/modules/local/get/largest_scaffold/main.nf b/modules/local/get/largest_scaffold/main.nf deleted file mode 100644 index a496a800..00000000 --- a/modules/local/get/largest_scaffold/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process GET_LARGEST_SCAFFOLD { - - tag "$meta.id" - label 'process_low' - - conda "conda-forge::coreutils=9.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'docker.io/ubuntu:20.04' }" - - input: - tuple val( meta ), path( file ) - - output: - env largest_scaff, emit: scaff_size - path "versions.yml", emit: versions - - script: - def LARGEST_SCAFF_VERSION = "2.0" - def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
- """ - largest_scaff=\$(head -n 1 "${file}" | cut -d\$'\t' -f2) - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - get_largest_scaffold: $LARGEST_SCAFF_VERSION - coreutils: $VERSION - END_VERSIONS - """ - - stub: - def LARGEST_SCAFF_VERSION = "2.0" - def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - """ - largest_scaff=1000000 - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - get_largest_scaff: $LARGEST_SCAFF_VERSION - coreutils: $VERSION - END_VERSIONS - """ -} diff --git a/modules/local/graph/overall_coverage/main.nf b/modules/local/graph/overall_coverage/main.nf deleted file mode 100644 index 87892813..00000000 --- a/modules/local/graph/overall_coverage/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process GRAPH_OVERALL_COVERAGE { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::perl=5.26.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/perl:5.26.2' : - 'biocontainers/perl:5.26.2' }" - - input: - tuple val(meta), path(bed) - - output: - tuple val(meta), path("*.part") , emit: part - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - graph_overall_coverage.pl $bed > ${prefix}.part - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - perl: \$(echo \$(perl --version 2>&1) | awk '/This/ {print \$9}')) - graph_overall_coverage.pl: \$(graph_overall_coverage.pl --version) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.part - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - perl: \$(echo \$(perl --version 2>&1) | awk '/This/ {print \$9}')) graph_overall_coverage.pl: \$(graph_overall_coverage.pl --version) - END_VERSIONS - """ -} diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml deleted file mode 100644 index 50c2059a..00000000 --- a/modules/nf-core/cat/cat/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf deleted file mode 100644 index 2862c64c..00000000 --- a/modules/nf-core/cat/cat/main.nf +++ /dev/null @@ -1,78 +0,0 @@ -process CAT_CAT { - tag "$meta.id" - label 'process_low' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : - 'biocontainers/pigz:2.3.4' }" - - input: - tuple val(meta), path(files_in) - - output: - tuple val(meta), path("${prefix}"), emit: file_out - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' - def file_list = files_in.collect { it.toString() } - - // choose appropriate concatenation tool depending on input and output format - - // | input | output | command1 | command2 | - // |-----------|------------|----------|----------| - // | gzipped | gzipped | cat | | - // | ungzipped | ungzipped | cat | | - // | gzipped | ungzipped | zcat | | - // | ungzipped | gzipped | cat | pigz | - - // Use input file ending as default - prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" - out_zip = prefix.endsWith('.gz') - in_zip = file_list[0].endsWith('.gz') - command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' - command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' - if(file_list.contains(prefix.trim())) { - error "The name of the input file can't be the same as for the output prefix in the " + - "module CAT_CAT (currently `$prefix`). Please choose a different one." - } - """ - $command1 \\ - $args \\ - ${file_list.join(' ')} \\ - $command2 \\ - > ${prefix} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) - END_VERSIONS - """ - - stub: - def file_list = files_in.collect { it.toString() } - prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" - if(file_list.contains(prefix.trim())) { - error "The name of the input file can't be the same as for the output prefix in the " + - "module CAT_CAT (currently `$prefix`). Please choose a different one." 
- } - """ - touch $prefix - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) - END_VERSIONS - """ -} - -// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz -def getFileSuffix(filename) { - def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ - return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) -} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml deleted file mode 100644 index 2a9284d7..00000000 --- a/modules/nf-core/cat/cat/meta.yml +++ /dev/null @@ -1,46 +0,0 @@ -name: cat_cat -description: A module for concatenation of gzipped or uncompressed files -keywords: - - concatenate - - gzip - - cat -tools: - - cat: - description: Just concatenation - documentation: https://man7.org/linux/man-pages/man1/cat.1.html - licence: ["GPL-3.0-or-later"] - identifier: "" -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - files_in: - type: file - description: List of compressed / uncompressed files - pattern: "*" - ontologies: [] -output: - file_out: - - - meta: - type: map - description: Groovy Map containing sample information - - ${prefix}: - type: file - description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" - pattern: "${file_out}" - ontologies: [] - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: http://edamontology.org/format_3750 # YAML -authors: - - "@erikrikarddaniel" - - "@FriederikeHanssen" -maintainers: - - "@erikrikarddaniel" - - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test deleted file mode 100644 index 9cb16178..00000000 --- a/modules/nf-core/cat/cat/tests/main.nf.test +++ /dev/null @@ -1,191 +0,0 @@ -nextflow_process { - - name "Test Process CAT_CAT" - script "../main.nf" - process "CAT_CAT" - tag "modules" - tag "modules_nfcore" - tag "cat" - tag "cat/cat" - - test("test_cat_name_conflict") { - when { - params { - outdir = "${outputDir}" - } - process { - """ - input[0] = - [ - [ id:'genome', single_end:true ], - [ - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) - ] - ] - """ - } - } - then { - assertAll( - { assert !process.success }, - { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("test_cat_unzipped_unzipped") { - when { - params { - outdir = "${outputDir}" - } - process { - """ - input[0] = - [ - [ id:'test', single_end:true ], - [ - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) - ] - ] - """ - } - } - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - - test("test_cat_zipped_zipped") { - when { - params { - outdir = 
"${outputDir}" - } - process { - """ - input[0] = - [ - [ id:'test', single_end:true ], - [ - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) - ] - ] - """ - } - } - then { - def lines = path(process.out.file_out.get(0).get(1)).linesGzip - assertAll( - { assert process.success }, - { assert snapshot( - lines[0..5], - lines.size(), - process.out.versions - ).match() - } - ) - } - } - - test("test_cat_zipped_unzipped") { - config './nextflow_zipped_unzipped.config' - - when { - params { - outdir = "${outputDir}" - } - process { - """ - input[0] = - [ - [ id:'test', single_end:true ], - [ - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) - ] - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("test_cat_unzipped_zipped") { - config './nextflow_unzipped_zipped.config' - when { - params { - outdir = "${outputDir}" - } - process { - """ - input[0] = - [ - [ id:'test', single_end:true ], - [ - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) - ] - ] - """ - } - } - then { - def lines = path(process.out.file_out.get(0).get(1)).linesGzip - assertAll( - { assert process.success }, - { assert snapshot( - lines[0..5], - lines.size(), - process.out.versions - ).match() - } - ) - } - } - - test("test_cat_one_file_unzipped_zipped") { - config './nextflow_unzipped_zipped.config' - when { - params { - outdir = "${outputDir}" - } - process { - """ - input[0] = - [ - [ 
id:'test', single_end:true ], - [ - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) - ] - ] - """ - } - } - then { - def lines = path(process.out.file_out.get(0).get(1)).linesGzip - assertAll( - { assert process.success }, - { assert snapshot( - lines[0..5], - lines.size(), - process.out.versions - ).match() - } - ) - } - } -} diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap deleted file mode 100644 index b7623ee6..00000000 --- a/modules/nf-core/cat/cat/tests/main.nf.test.snap +++ /dev/null @@ -1,147 +0,0 @@ -{ - "test_cat_unzipped_unzipped": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": true - }, - "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" - ] - ], - "1": [ - "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" - ], - "file_out": [ - [ - { - "id": "test", - "single_end": true - }, - "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" - ] - ], - "versions": [ - "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" - }, - "timestamp": "2023-10-16T14:32:18.500464399" - }, - "test_cat_zipped_unzipped": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": true - }, - "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" - ] - ], - "1": [ - "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" - ], - "file_out": [ - [ - { - "id": "test", - "single_end": true - }, - "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" - ] - ], - "versions": [ - "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" - ] - } - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" - }, - "timestamp": "2023-10-16T14:32:49.642741302" - }, - "test_cat_zipped_zipped": { - "content": [ - [ - "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", - 
"MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", - "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", - "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", - "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", - "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" - ], - 78, - [ - "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:51:46.802978" - }, - "test_cat_name_conflict": { - "content": [ - [ - - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:51:29.45394" - }, - "test_cat_one_file_unzipped_zipped": { - "content": [ - [ - ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", - "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", - "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", - "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", - "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", - "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" - ], - 374, - [ - "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:52:02.774016" - }, - 
"test_cat_unzipped_zipped": { - "content": [ - [ - ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", - "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", - "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", - "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", - "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", - "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" - ], - 375, - [ - "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:51:57.581523" - } -} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config deleted file mode 100644 index ec26b0fd..00000000 --- a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config +++ /dev/null @@ -1,6 +0,0 @@ - -process { - withName: CAT_CAT { - ext.prefix = 'cat.txt.gz' - } -} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config deleted file mode 100644 index fbc79783..00000000 --- a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config +++ /dev/null @@ -1,8 +0,0 @@ - -process { - - withName: CAT_CAT { - ext.prefix = 'cat.txt' - } - -} diff --git a/modules/nf-core/tabix/bgziptabix/environment.yml b/modules/nf-core/tabix/bgziptabix/environment.yml deleted file mode 100644 index 771b1387..00000000 --- a/modules/nf-core/tabix/bgziptabix/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda - -dependencies: - - bioconda::htslib=1.21 
diff --git a/modules/nf-core/tabix/bgziptabix/main.nf b/modules/nf-core/tabix/bgziptabix/main.nf deleted file mode 100644 index f295c7f2..00000000 --- a/modules/nf-core/tabix/bgziptabix/main.nf +++ /dev/null @@ -1,48 +0,0 @@ -process TABIX_BGZIPTABIX { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/92/92859404d861ae01afb87e2b789aebc71c0ab546397af890c7df74e4ee22c8dd/data' : - 'community.wave.seqera.io/library/htslib:1.21--ff8e28a189fbecaa' }" - - input: - tuple val(meta), path(input) - - output: - tuple val(meta), path("*.gz"), path("*.tbi"), optional: true, emit: gz_tbi - tuple val(meta), path("*.gz"), path("*.csi"), optional: true, emit: gz_csi - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - bgzip --threads ${task.cpus} -c $args $input > ${prefix}.${input.getExtension()}.gz - tabix --threads ${task.cpus} $args2 ${prefix}.${input.getExtension()}.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def args2 = task.ext.args2 ?: '' - def index = args2.contains("-C ") || args2.contains("--csi") ? 
"csi" : "tbi" - """ - echo "" | gzip > ${prefix}.${input.getExtension()}.gz - touch ${prefix}.${input.getExtension()}.gz.${index} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/tabix/bgziptabix/meta.yml b/modules/nf-core/tabix/bgziptabix/meta.yml deleted file mode 100644 index 9c2c46d1..00000000 --- a/modules/nf-core/tabix/bgziptabix/meta.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: tabix_bgziptabix -description: bgzip a sorted tab-delimited genome file and then create tabix index -keywords: - - bgzip - - compress - - index - - tabix - - vcf -tools: - - tabix: - description: Generic indexer for TAB-delimited genome position files. - homepage: https://www.htslib.org/doc/tabix.html - documentation: https://www.htslib.org/doc/tabix.1.html - doi: 10.1093/bioinformatics/btq671 - licence: ["MIT"] - identifier: biotools:tabix -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: Sorted tab-delimited genome file - ontologies: [] -output: - gz_tbi: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - "*.gz": - type: file - description: bgzipped tab-delimited genome file - pattern: "*.gz" - ontologies: - - edam: http://edamontology.org/format_3989 # GZIP format - - "*.tbi": - type: file - description: tabix index file - pattern: "*.tbi" - ontologies: [] - gz_csi: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - "*.gz": - type: file - description: bgzipped tab-delimited genome file - pattern: "*.gz" - ontologies: - - edam: http://edamontology.org/format_3989 # GZIP format - - "*.csi": - type: file - description: csi index file - pattern: "*.csi" - ontologies: [] - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: http://edamontology.org/format_3750 # YAML -authors: - - "@maxulysse" - - "@DLBPointon" -maintainers: - - "@maxulysse" - - "@DLBPointon" diff --git a/modules/nf-core/tabix/bgziptabix/tests/main.nf.test b/modules/nf-core/tabix/bgziptabix/tests/main.nf.test deleted file mode 100644 index cdb016e5..00000000 --- a/modules/nf-core/tabix/bgziptabix/tests/main.nf.test +++ /dev/null @@ -1,123 +0,0 @@ -nextflow_process { - - name "Test Process TABIX_BGZIPTABIX" - script "../main.nf" - process "TABIX_BGZIPTABIX" - - tag "modules" - tag "modules_nfcore" - tag "tabix" - tag "tabix/bgziptabix" - - test("sarscov2_bed_tbi") { - config "./tabix_tbi.config" - - when { - process { - """ - input[0] = [ - [ id:'tbi_test' ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] - ] - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() }, - { assert snapshot( - file(process.out.gz_tbi[0][1]).name - ).match("tbi_test") - } - ) - } - } - - test("sarscov2_bed_csi") { - config "./tabix_csi.config" - - when { - process { - """ - input[0] = [ - [ id:'csi_test' ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] - ] - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() }, - { assert snapshot( - file(process.out.gz_csi[0][1]).name - ).match("csi_test") - } - ) - } - - } - - test("sarscov2_bed_csi_stub") { - config "./tabix_csi.config" - - options 
"-stub" - - when { - process { - """ - input[0] = [ - [ id:'test' ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] - ] - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() }, - { assert snapshot( - file(process.out.gz_csi[0][1]).name - ).match("csi_stub") - } - ) - } - - } - - test("sarscov2_bed_tbi_stub") { - config "./tabix_tbi.config" - - options "-stub" - - when { - process { - """ - input[0] = [ - [ id:'test' ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] - ] - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() }, - { assert snapshot( - file(process.out.gz_tbi[0][1]).name - ).match("tbi_stub") - } - ) - } - - } - -} diff --git a/modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap b/modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap deleted file mode 100644 index 5f818045..00000000 --- a/modules/nf-core/tabix/bgziptabix/tests/main.nf.test.snap +++ /dev/null @@ -1,206 +0,0 @@ -{ - "sarscov2_bed_tbi": { - "content": [ - { - "0": [ - [ - { - "id": "tbi_test" - }, - "tbi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", - "tbi_test.bed.gz.tbi:md5,ca06caf88b1e3c67d5fcba0a1460b52c" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ], - "gz_csi": [ - - ], - "gz_tbi": [ - [ - { - "id": "tbi_test" - }, - "tbi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", - "tbi_test.bed.gz.tbi:md5,ca06caf88b1e3c67d5fcba0a1460b52c" - ] - ], - "versions": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.5" - }, - "timestamp": "2025-03-26T13:52:30.53305451" - }, - "sarscov2_bed_csi": { - "content": [ - { - "0": [ - - ], - "1": [ - [ - { - "id": "csi_test" - }, - "csi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", - 
"csi_test.bed.gz.csi:md5,c9c0377de58fdc89672bb3005a0d69f5" - ] - ], - "2": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ], - "gz_csi": [ - [ - { - "id": "csi_test" - }, - "csi_test.bed.gz:md5,fe4053cf4de3aebbdfc3be2efb125a74", - "csi_test.bed.gz.csi:md5,c9c0377de58fdc89672bb3005a0d69f5" - ] - ], - "gz_tbi": [ - - ], - "versions": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.5" - }, - "timestamp": "2025-03-26T13:52:34.152301569" - }, - "csi_test": { - "content": [ - "csi_test.bed.gz" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-02-19T14:51:00.548801" - }, - "sarscov2_bed_tbi_stub": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", - "test.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - - ], - "2": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ], - "gz_csi": [ - - ], - "gz_tbi": [ - [ - { - "id": "test" - }, - "test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", - "test.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.5" - }, - "timestamp": "2025-03-26T13:52:41.271812789" - }, - "csi_stub": { - "content": [ - "test.bed.gz" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-02-19T14:51:09.218454" - }, - "tbi_stub": { - "content": [ - "test.bed.gz" - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-09-25T14:45:18.550930179" - }, - "tbi_test": { - "content": [ - "tbi_test.bed.gz" - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "24.04.2" - }, - "timestamp": "2024-02-19T14:50:51.579654" - }, - "sarscov2_bed_csi_stub": { - "content": [ - { - "0": [ - - ], - "1": [ - [ - { - "id": "test" - }, - 
"test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", - "test.bed.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ], - "gz_csi": [ - [ - { - "id": "test" - }, - "test.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", - "test.bed.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "gz_tbi": [ - - ], - "versions": [ - "versions.yml:md5,9a7904908d7400fc67ef0412a925e9fc" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.5" - }, - "timestamp": "2025-03-26T13:52:37.709221651" - } -} \ No newline at end of file diff --git a/modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config b/modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config deleted file mode 100644 index fb41a314..00000000 --- a/modules/nf-core/tabix/bgziptabix/tests/tabix_csi.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: TABIX_BGZIPTABIX { - ext.args2 = '-p vcf --csi' - } -} diff --git a/modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config b/modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config deleted file mode 100644 index c1915dc4..00000000 --- a/modules/nf-core/tabix/bgziptabix/tests/tabix_tbi.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: TABIX_BGZIPTABIX { - ext.args2 = '-p vcf' - } -} \ No newline at end of file diff --git a/tests/main.nf.test b/tests/main.nf.test index a39a3713..269608a5 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -13,6 +13,7 @@ nextflow_pipeline { outdir = "${outputDir}" all_output = true skip_tracks = "NONE" + split_telomere = true } } From 13bd9bb0f9987b9c3af8f3caa9410606ba960b74 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 5 Aug 2025 12:06:25 +0100 Subject: [PATCH 16/58] Updates --- conf/modules.config | 15 ++++++++++++--- tests/main.nf.test.snap | 13 +++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 02b8162f..f867ecc2 100644 --- a/conf/modules.config +++ 
b/conf/modules.config @@ -17,9 +17,18 @@ process { // withName: 'PRETEXT_INGEST_SNDRD|PRETEXT_INGEST_HIRES' { publishDir = [ - path: { "${params.outdir}/pretext_maps_processed" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + [ + path: { "${params.outdir}/pretext_maps_processed" }, + pattern: "*normal.pretext", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ], + [ + path: { "${params.outdir}/pretext_maps_processed" }, + pattern: "*hr.pretext", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ], ] } diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap index 32259ee9..5cac579d 100644 --- a/tests/main.nf.test.snap +++ b/tests/main.nf.test.snap @@ -135,15 +135,12 @@ "pipeline_info/sanger-tol_curationpretext_software_versions.yml", "pretext_maps_processed", "pretext_maps_processed/CurationPretextTest_normal.pretext", - "pretext_maps_processed/telo_0.pretext", - "pretext_maps_processed/telo_1.pretext", - "pretext_maps_processed/telo_2.pretext", "pretext_maps_raw", "pretext_maps_raw/CurationPretextTest_normal_pi.pretext", "pretext_snapshot", "pretext_snapshot/CurationPretextTest_normalFullMap.png" ], - 21, + 18, [ "CurationPretextTest.bigWig:md5,3f66a9152d793a62f877b733c2336dfd", "CurationPretextTest.gap.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", @@ -159,7 +156,7 @@ 1, false, true, - 4, + 1, false, true, 1, @@ -167,8 +164,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "25.04.1" + "nextflow": "24.04.4" }, - "timestamp": "2025-08-04T17:47:27.212054464" + "timestamp": "2025-08-05T10:25:42.179879" } -} \ No newline at end of file +} From ed7c32322250fd0bda9bd1020dfdb7800379bdf8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:31:58 +0100 Subject: [PATCH 17/58] Update tests --- tests/main.nf.test | 20 +++++++++----------- tests/main.nf.test.snap | 
6 +++--- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/tests/main.nf.test b/tests/main.nf.test index 269608a5..bede8ee8 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -31,25 +31,23 @@ nextflow_pipeline { assertAll( {assert workflow.success}, {assert snapshot( - // Test for number of successful processes - should be 29 for a full run - workflow.trace.succeeded().size(), + // Test for number of successful processes + workflow.trace.succeeded().size(), // 42 with out needing to gunzip the assembly removeNextflowVersion("$outputDir/pipeline_info/sanger-tol_curationpretext_software_versions.yml"), // Stable name with relative path stable_name, - stable_name.size(), + stable_name.size(), // 18 // Accessory files accessories, - accessories.size(), - - // The two pretext files - // Presence of files indicated presence of the raw_pretexts - // we expect this to be a list of two files - // we can't use their md5sum as they will be different everytime - // Then double check that there are two - // one is a hr and the other a normal variant + accessories.size(), // 9 + + // The pretext files + // We only expect 1 pretext file as when using the `test` profile + // we are skipping hr pretext file generation + // so size will be 1 and presence of hr file is false pretext_maps_raw.size(), pretext_maps_raw.any{it.toString().contains("_hr_pi.pretext".toString())}, pretext_maps_raw.any{it.toString().contains("_normal_pi.pretext".toString())}, diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap index 5cac579d..4b352b5c 100644 --- a/tests/main.nf.test.snap +++ b/tests/main.nf.test.snap @@ -164,8 +164,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.04.4" + "nextflow": "25.04.6" }, - "timestamp": "2025-08-05T10:25:42.179879" + "timestamp": "2025-08-21T16:06:51.036682" } -} +} \ No newline at end of file From 656d9fcdd7f954f7c6412ce362812cd50a34c01d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:32:46 +0100 Subject: 
[PATCH 18/58] Add GUNZIP --- workflows/curationpretext.nf | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/workflows/curationpretext.nf b/workflows/curationpretext.nf index 6e63803d..69ef0a1d 100644 --- a/workflows/curationpretext.nf +++ b/workflows/curationpretext.nf @@ -4,8 +4,9 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' include { GAWK as GAWK_UPPER_SEQUENCE } from '../modules/nf-core/gawk/main' +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { GUNZIP } from '../modules/nf-core/gunzip/main' include { PRETEXT_GRAPH as PRETEXT_INGEST_SNDRD } from '../modules/local/pretext/graph/main' include { PRETEXT_GRAPH as PRETEXT_INGEST_HIRES } from '../modules/local/pretext/graph/main' @@ -35,11 +36,37 @@ workflow CURATIONPRETEXT { ch_empty_file = Channel.fromPath("${baseDir}/assets/EMPTY.txt") + ch_reference + .branch { meta, file -> + zipped: file.name.endsWith('.gz') + unzipped: !file.name.endsWith('.gz') + } + .set {ch_input} + + // + // MODULE: UNZIP INPUTS IF NEEDED + // + GUNZIP ( + ch_input.zipped + ) + ch_versions = ch_versions.mix(GUNZIP.out.versions) + + + // + // LOGIC: MIX CHANELS WHICH MAY OR MAY NOT BE EMPTY INTO A SINGLE QUEUE CHANNEL + // + unzipped_input = Channel.empty() + + unzipped_input + .mix(ch_input.unzipped, GUNZIP.out.gunzip) + .set { unzipped_reference } + + // // MODULE: UPPERCASE THE REFERENCE SEQUENCE // GAWK_UPPER_SEQUENCE( - ch_reference, + unzipped_reference, [], false, ) From 84fa91ed2aa55a3346a28e4c95bab62bf6816ea1 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:33:52 +0100 Subject: [PATCH 19/58] Minor Updates --- modules/local/pretext/graph/main.nf | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/modules/local/pretext/graph/main.nf b/modules/local/pretext/graph/main.nf index 
ac966417..49bec139 100644 --- a/modules/local/pretext/graph/main.nf +++ b/modules/local/pretext/graph/main.nf @@ -85,38 +85,35 @@ process PRETEXT_GRAPH { esac done - ls telomere/* - echo \$file_og - if [ -s "\$file_og" ]; then - echo "Processing OG_TELOMERE file..." + echo "Processing OG_TELOMERE file: \$file_og" PretextGraph $args -i "\$input_file" -n "og_telomere" -o telo_0.pretext < "\$file_og" else - echo "No OG TELOMERE file" + echo "OG TELOMERE file - Could be empty or missing" cp "\$input_file" telo_0.pretext fi if [ -s "\$file_telox" ]; then - echo "Processing TELOX_TELOMERE file..." + echo "Processing TELOX_TELOMERE file: \$file_telox" PretextGraph $args -i telo_0.pretext -n "telox_telomere" -o telo_1.pretext < "\$file_telox" else - echo "No TELOX file" + echo "TELOX file - Could be empty or missing" cp telo_0.pretext telo_1.pretext fi if [ -s "\$file_5p" ]; then - echo "Processing 5 Prime TELOMERE file..." + echo "Processing 5-Prime TELOMERE file: \$file_5p" PretextGraph $args -i telo_1.pretext -n "5p_telomere" -o telo_2.pretext < "\$file_5p" else - echo "No 5Prime TELOMERE file" + echo "5-Prime TELOMERE file - Could be empty or missing" cp telo_1.pretext telo_2.pretext fi if [ -s "\$file_3p" ]; then - echo "Processing 3 Prime TELOMERE file..." 
+ echo "Processing 3-Prime TELOMERE file: \$file_3p" PretextGraph $args -i telo_2.pretext -n "3p_telomere" -o "${prefix}.pretext" < "\$file_3p" else - echo "No 3Prime TELOMERE file" + echo "3-Prime TELOMERE file - Could be empty or missing" cp telo_2.pretext "${prefix}.pretext" fi From 908783fffc73d8ba7c87c87d5fed36b6a1cba005 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:40:53 +0100 Subject: [PATCH 20/58] Adding Data download to setup --- tests/main.nf.test | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/main.nf.test b/tests/main.nf.test index bede8ee8..bb46ee51 100644 --- a/tests/main.nf.test +++ b/tests/main.nf.test @@ -8,6 +8,17 @@ nextflow_pipeline { test("Full run") { + setup { + println "\nDownloading the test data..." + def command = ['bash', '-c', "curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf - -C ${projectDir}/"] + def process = command.execute() + process.waitFor() + + if (process.exitValue() != 0) { + throw new RuntimeException("Error - failed to download TreeValTinyData.tar.gz: ${process.err.text}") + } + } + when { params { outdir = "${outputDir}" From c92706dffecf0bf07231a5f275a9983e350c94e3 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:41:56 +0100 Subject: [PATCH 21/58] Remove now duplicated data download --- .github/workflows/ci.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 583d8add..cf35c042 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,11 +67,6 @@ jobs: mkdir -p $NXF_SINGULARITY_CACHEDIR mkdir -p $NXF_SINGULARITY_LIBRARYDIR - - name: Download test data - # Download A fungal test data set that is full enough to show some real output. 
- run: | - curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf - - - name: Install nf-test uses: nf-core/setup-nf-test@v1 From 848cb1c4f8a8effd9436854bf7ad79e573dacc25 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:43:09 +0100 Subject: [PATCH 22/58] Update CHANGELOG --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e297302..3956adb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added and Fixed +- Template update to 3.3.3. - Addition of the `--split_telomere` boolean flag, this is false by default. - When `true` the pipeline will split the telomere file into a 5 and 3 prime file. - Update `ACCESSORY_FILES` subworkflow: @@ -22,6 +23,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Moved `GAWK_UPPER_SEQUENCE` from the `TELO_FINDER` subworkflow to the first step of the main `curationpretext` workflow, this simply makes more sense. - Removed no longer needed scripts from bin. - Added the `gawk_split_directions.awk` script for split telomere. +- Addition of GUNZIP for the input reference genome. +- Update tests. 
### Paramters @@ -41,6 +44,8 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i | `GAWK_CLEAN_TELOMERE` | 5.3.0 | REMOVED | | `GAWK_MAP_TELO` | 5.3.0 | REMOVED | | `GET_LARGEST_SCAFF` | coreutils=9.1 | REMOVED | +| `GUNZIP` | NA | 1.13 | + ## [[1.4.2](https://github.com/sanger-tol/curationpretext/releases/tag/1.4.2)] - UNSC Nereid (H2) - [2025-07-28] From e335ab85ea3298e59c9da3eaecc2e79dfb71df39 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:43:52 +0100 Subject: [PATCH 23/58] Update modules --- modules.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules.json b/modules.json index 30d74ced..61f01451 100644 --- a/modules.json +++ b/modules.json @@ -45,6 +45,11 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "gunzip": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "minimap2/align": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", From d549bc510a90a46504da95eb249eb8269787ba59 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 16:46:52 +0100 Subject: [PATCH 24/58] Update --- modules/nf-core/gunzip/environment.yml | 12 ++ modules/nf-core/gunzip/main.nf | 55 +++++++ modules/nf-core/gunzip/meta.yml | 52 +++++++ modules/nf-core/gunzip/tests/main.nf.test | 121 ++++++++++++++++ .../nf-core/gunzip/tests/main.nf.test.snap | 134 ++++++++++++++++++ modules/nf-core/gunzip/tests/nextflow.config | 5 + 6 files changed, 379 insertions(+) create mode 100644 modules/nf-core/gunzip/environment.yml create mode 100644 modules/nf-core/gunzip/main.nf create mode 100644 modules/nf-core/gunzip/meta.yml create mode 100644 modules/nf-core/gunzip/tests/main.nf.test create mode 100644 modules/nf-core/gunzip/tests/main.nf.test.snap create mode 100644 modules/nf-core/gunzip/tests/nextflow.config diff --git a/modules/nf-core/gunzip/environment.yml 
b/modules/nf-core/gunzip/environment.yml new file mode 100644 index 00000000..9b926b1f --- /dev/null +++ b/modules/nf-core/gunzip/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::coreutils=9.5 + - conda-forge::grep=3.11 + - conda-forge::gzip=1.13 + - conda-forge::lbzip2=2.5 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 00000000..3ffc8e92 --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,55 @@ +process GUNZIP { + tag "${archive}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/52/52ccce28d2ab928ab862e25aae26314d69c8e38bd41ca9431c67ef05221348aa/data' + : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:838ba80435a629f8'}" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${gunzip}"), emit: gunzip + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def extension = (archive.toString() - '.gz').tokenize('.')[-1] + def name = archive.toString() - '.gz' - ".${extension}" + def prefix = task.ext.prefix ?: name + gunzip = prefix + ".${extension}" + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + ${args} \\ + ${archive} \\ + > ${gunzip} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + def 
args = task.ext.args ?: '' + def extension = (archive.toString() - '.gz').tokenize('.')[-1] + def name = archive.toString() - '.gz' - ".${extension}" + def prefix = task.ext.prefix ?: name + gunzip = prefix + ".${extension}" + """ + touch ${gunzip} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 00000000..926bb22a --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,52 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. 
[ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" + ontologies: [] +output: + gunzip: + - - meta: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + ontologies: [] + - ${gunzip}: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" + - "@gallvp" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test new file mode 100644 index 00000000..776211ad --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test @@ -0,0 +1,121 @@ +nextflow_process { + + name "Test Process GUNZIP" + script "../main.nf" + process "GUNZIP" + tag "gunzip" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Should run without failures - prefix") { + + config './nextflow.config' + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id: 'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Should run without failures - stub") { + + options '-stub' + + when { + params { + outdir = 
"$outputDir" + } + process { + """ + input[0] = Channel.of([ + [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Should run without failures - prefix - stub") { + + options '-stub' + config './nextflow.config' + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id: 'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap new file mode 100644 index 00000000..a0f0e67e --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test.snap @@ -0,0 +1,134 @@ +{ + "Should run without failures - prefix - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:22.080222697" + }, + "Should run without failures - stub": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": 
"0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:14.593020264" + }, + "Should run without failures": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:01.295397925" + }, + "Should run without failures - prefix": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ], + "gunzip": [ + [ + { + "id": "test" + }, + "test.xyz.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d327e4a19a6d5c5e974136cef8999d8c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2024-12-13T11:48:07.414271387" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/tests/nextflow.config b/modules/nf-core/gunzip/tests/nextflow.config new file mode 100644 index 00000000..dec77642 --- /dev/null +++ b/modules/nf-core/gunzip/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GUNZIP { + ext.prefix = { "${meta.id}.xyz" } + } +} From eb53abe819672e253aebf40b4ae84fa821b2fe8e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 21 Aug 2025 20:55:54 +0100 Subject: [PATCH 25/58] Update --- modules.json | 98 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 25 deletions(-) diff --git a/modules.json b/modules.json index 61f01451..1fddd1b6 100644 --- a/modules.json +++ b/modules.json @@ -8,109 +8,151 @@ "bedtools/bamtobed": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": 
["modules"] + "installed_by": [ + "modules" + ] }, "bedtools/genomecov": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bedtools/intersect": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bedtools/makewindows": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bedtools/map": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "bwamem2/index": { "branch": "master", "git_sha": "a29f18660f5e3748d44d6f716241e70c942c065d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gawk": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gnu/sort": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "gunzip": { "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/align": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/index": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pretextmap": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/pretextmap/pretextmap.diff" }, "pretextsnapshot": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", - 
"installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/pretextsnapshot/pretextsnapshot.diff" }, "samtools/faidx": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/merge": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "seqtk/cutn": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "ucsc/bedgraphtobigwig": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "windowmasker/ustat": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -119,20 +161,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", - "installed_by": ["subworkflows"] + 
"installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file From 2398af56ec7694075813c136750a07f5fff57100 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 22 Aug 2025 10:45:27 +0100 Subject: [PATCH 26/58] Update Tests --- tests/main.nf.test.snap | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap index 4b352b5c..8d8f6ea3 100644 --- a/tests/main.nf.test.snap +++ b/tests/main.nf.test.snap @@ -150,7 +150,7 @@ "CurationPretextTest_5P_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", "CurationPretextTest_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", "CurationPretextTest_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", - "coverage.bigWig:md5,39b3e8b7751b33758087cafc9a3c689e" + "coverage.bigWig:md5,2e474506c957152b231ac63c859f0b17" ], 9, 1, @@ -166,6 +166,6 @@ "nf-test": "0.9.2", "nextflow": "25.04.6" }, - "timestamp": "2025-08-21T16:06:51.036682" + "timestamp": "2025-08-21T21:25:49.92252227" } } \ No newline at end of file From 08ff1fa61e4a0fe3bfe2b95a36dbd4a5e5b0b5ad Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 22 Aug 2025 11:04:45 +0100 Subject: [PATCH 27/58] Prettier! 
--- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3956adb4..b04b2082 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,7 +46,6 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i | `GET_LARGEST_SCAFF` | coreutils=9.1 | REMOVED | | `GUNZIP` | NA | 1.13 | - ## [[1.4.2](https://github.com/sanger-tol/curationpretext/releases/tag/1.4.2)] - UNSC Nereid (H2) - [2025-07-28] ### Added and Fixed From 6feb191e5d7ef3805c0fc9d2a3a33ccdbe23c209 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 22 Aug 2025 11:28:56 +0100 Subject: [PATCH 28/58] Prettier --- modules.json | 98 ++++++++++++++-------------------------------------- 1 file changed, 25 insertions(+), 73 deletions(-) diff --git a/modules.json b/modules.json index 1fddd1b6..61f01451 100644 --- a/modules.json +++ b/modules.json @@ -8,151 +8,109 @@ "bedtools/bamtobed": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bedtools/genomecov": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bedtools/intersect": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bedtools/makewindows": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bedtools/map": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "bwamem2/index": { "branch": "master", "git_sha": "a29f18660f5e3748d44d6f716241e70c942c065d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gawk": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - 
"modules" - ] + "installed_by": ["modules"] }, "gnu/sort": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "gunzip": { "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/index": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pretextmap": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/pretextmap/pretextmap.diff" }, "pretextsnapshot": { "branch": "master", "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/pretextsnapshot/pretextsnapshot.diff" }, "samtools/faidx": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/merge": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/sort": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "seqtk/cutn": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "ucsc/bedgraphtobigwig": { "branch": "master", 
"git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/ustat": { "branch": "master", "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -161,26 +119,20 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} From ab458848c03ca248d1c321432398489fe1a5a293 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 22 Aug 2025 16:55:53 +0100 Subject: [PATCH 29/58] Correct the direction of the telo --- modules/local/gawk/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/gawk/main.nf b/modules/local/gawk/main.nf index f7f34b2e..f8e631ea 100644 --- a/modules/local/gawk/main.nf +++ b/modules/local/gawk/main.nf @@ -13,8 +13,8 @@ process GAWK { val(disable_redirect_output) output: - tuple val(meta), path("direction.0.${suffix}"), emit: prime3 - tuple val(meta), path("direction.1.${suffix}"), emit: prime5 + tuple val(meta), path("direction.0.${suffix}"), emit: prime5 + tuple val(meta), path("direction.1.${suffix}"), emit: prime3 path "versions.yml" , emit: versions when: From d2ace1fe7326afa16720ac6a8863c363536f447b Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 26 Aug 2025 14:44:09 
+0100 Subject: [PATCH 30/58] Update based on comments --- modules/local/extract/telo/main.nf | 43 ---------------- modules/local/extract/telomere/main.nf | 2 +- modules/local/gawk/environment.yml | 7 --- modules/local/gawk/main.nf | 68 -------------------------- modules/local/gawk/meta.yml | 63 ------------------------ subworkflows/local/telo_finder/main.nf | 5 +- 6 files changed, 3 insertions(+), 185 deletions(-) delete mode 100755 modules/local/extract/telo/main.nf delete mode 100644 modules/local/gawk/environment.yml delete mode 100644 modules/local/gawk/main.nf delete mode 100644 modules/local/gawk/meta.yml diff --git a/modules/local/extract/telo/main.nf b/modules/local/extract/telo/main.nf deleted file mode 100755 index 380c1acf..00000000 --- a/modules/local/extract/telo/main.nf +++ /dev/null @@ -1,43 +0,0 @@ -process EXTRACT_TELO { - tag "${meta.id}" - label 'process_low' - - conda "conda-forge::coreutils=9.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'docker.io/ubuntu:20.04' }" - - input: - tuple val( meta ), path( file ) - - output: - tuple val( meta ), file( "*bed" ) , emit: bed - tuple val( meta ), file("*bedgraph"), emit: bedgraph - path "versions.yml" , emit: versions - - script: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
- """ - cat "${file}" | awk '{print \$2"\\t"\$4"\\t"\$5}' | sed 's/>//g' > ${prefix}_telomere.bed - cat "${file}" | awk '{print \$2"\\t"\$4"\\t"\$5"\\t"(((\$5-\$4)<0)?-(\$5-\$4):(\$5-\$4))}' | sed 's/>//g' > ${prefix}_telomere.bedgraph - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - coreutils: $VERSION - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "9.1" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. - """ - touch ${prefix}_telomere.bed - touch ${prefix}_telomere.bedgraph - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - coreutils: $VERSION - END_VERSIONS - """ -} diff --git a/modules/local/extract/telomere/main.nf b/modules/local/extract/telomere/main.nf index a0ce237d..41022f00 100644 --- a/modules/local/extract/telomere/main.nf +++ b/modules/local/extract/telomere/main.nf @@ -1,6 +1,6 @@ process EXTRACT_TELOMERE { tag "${meta.id}" - label 'process_low' + label 'process_single' conda "conda-forge::coreutils=9.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/gawk/environment.yml b/modules/local/gawk/environment.yml deleted file mode 100644 index f52109e8..00000000 --- a/modules/local/gawk/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::gawk=5.3.0 diff --git a/modules/local/gawk/main.nf b/modules/local/gawk/main.nf deleted file mode 100644 index f8e631ea..00000000 --- a/modules/local/gawk/main.nf +++ /dev/null @@ -1,68 +0,0 @@ -process GAWK { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/gawk:5.3.0' : - 'biocontainers/gawk:5.3.0' }" - - input: - tuple val(meta), path(input, arity: '0..*') - path(program_file) - val(disable_redirect_output) - - output: - tuple val(meta), path("direction.0.${suffix}"), emit: prime5 - tuple val(meta), path("direction.1.${suffix}"), emit: prime3 - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' // args is used for the main arguments of the tool - def args2 = task.ext.args2 ?: '' // args2 is used to specify a program when no program file has been given - prefix = task.ext.prefix ?: "${meta.id}" - suffix = task.ext.suffix ?: "${input.collect{ it.getExtension()}.get(0)}" // use the first extension of the input files - - program = program_file ? "-f ${program_file}" : "${args2}" - lst_gz = input.findResults{ it.getExtension().endsWith("gz") ? it.toString() : null } - unzip = lst_gz ? "gunzip -q -f ${lst_gz.join(" ")}" : "" - input_cmd = input.collect { it.toString() - ~/\.gz$/ }.join(" ") - cleanup = lst_gz ? "rm ${lst_gz.collect{ it - ~/\.gz$/ }.join(" ")}" : "" - - input.collect{ - assert it.name != "${prefix}.${suffix}" : "Input and output names are the same, set prefix in module configuration to disambiguate!" - } - - """ - ${unzip} - - awk \\ - ${args} \\ - ${program} \\ - ${input_cmd} - - ${cleanup} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') - END_VERSIONS - """ - - stub: - prefix = task.ext.prefix ?: "${meta.id}" - suffix = task.ext.suffix ?: "${input.getExtension()}" - def create_cmd = suffix.endsWith("gz") ? 
"echo '' | gzip >" : "touch" - - """ - ${create_cmd} ${prefix}.${suffix} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') - END_VERSIONS - """ -} diff --git a/modules/local/gawk/meta.yml b/modules/local/gawk/meta.yml deleted file mode 100644 index 34c50b12..00000000 --- a/modules/local/gawk/meta.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: "gawk" -description: | - If you are like many computer users, you would frequently like to make changes in various text files - wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. - The job is easy with awk, especially the GNU implementation gawk. -keywords: - - gawk - - awk - - txt - - text - - file parsing -tools: - - "gawk": - description: "GNU awk" - homepage: "https://www.gnu.org/software/gawk/" - documentation: "https://www.gnu.org/software/gawk/manual/" - tool_dev_url: "https://www.gnu.org/prep/ftp.html" - licence: ["GPL v3"] - identifier: "" -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - input: - type: file - description: The input file - Specify the logic that needs to be executed on - this file on the `ext.args2` or in the program file. - If the files have a `.gz` extension, they will be unzipped using `zcat`. - pattern: "*" - - - program_file: - type: file - description: Optional file containing logic for awk to execute. If you don't - wish to use a file, you can use `ext.args2` to specify the logic. - pattern: "*" - - - disable_redirect_output: - type: boolean - description: Disable the redirection of awk output to a given file. This is - useful if you want to use awk's built-in redirect to write files instead - of the shell's redirect. -output: - - output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - "*.${suffix}": - type: file - description: The output file - if using shell redirection, specify the name of this - file using `ext.prefix` and the extension using `ext.suffix`. Otherwise, ensure - the awk program produces files with the extension in `ext.suffix`. - pattern: "*" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@nvnieuwk" -maintainers: - - "@nvnieuwk" diff --git a/subworkflows/local/telo_finder/main.nf b/subworkflows/local/telo_finder/main.nf index ce827a3d..cdf0d223 100644 --- a/subworkflows/local/telo_finder/main.nf +++ b/subworkflows/local/telo_finder/main.nf @@ -4,7 +4,7 @@ // MODULE IMPORT BLOCK // include { FIND_TELOMERE_REGIONS } from '../../../modules/local/find/telomere_regions/main' -include { GAWK as GAWK_SPLIT_DIRECTIONS } from '../../../modules/local/gawk/main' +include { GAWK_SPLIT_DIRECTIONS } from '../../../modules/local/gawk_split_directions/main' include { TELO_EXTRACTION } from '../../../subworkflows/local/telo_extraction/main' @@ -35,8 +35,7 @@ workflow TELO_FINDER { if (params.split_telomere) { GAWK_SPLIT_DIRECTIONS ( FIND_TELOMERE_REGIONS.out.telomere, - file("${projectDir}/bin/gawk_split_directions.awk"), - false + file("${projectDir}/bin/gawk_split_directions.awk") ) ch_versions = ch_versions.mix( GAWK_SPLIT_DIRECTIONS.out.versions ) From 17590d3f6f5395f5edd03a6c8778fe6566f289ab Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 26 Aug 2025 14:44:38 +0100 Subject: [PATCH 31/58] Update based on comments --- .../gawk_split_directions/environment.yml | 7 +++ modules/local/gawk_split_directions/main.nf | 58 +++++++++++++++++ modules/local/gawk_split_directions/meta.yml | 63 +++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 modules/local/gawk_split_directions/environment.yml create mode 100644 modules/local/gawk_split_directions/main.nf create mode 100644 
modules/local/gawk_split_directions/meta.yml diff --git a/modules/local/gawk_split_directions/environment.yml b/modules/local/gawk_split_directions/environment.yml new file mode 100644 index 00000000..f52109e8 --- /dev/null +++ b/modules/local/gawk_split_directions/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::gawk=5.3.0 diff --git a/modules/local/gawk_split_directions/main.nf b/modules/local/gawk_split_directions/main.nf new file mode 100644 index 00000000..29b4af8a --- /dev/null +++ b/modules/local/gawk_split_directions/main.nf @@ -0,0 +1,58 @@ +process GAWK_SPLIT_DIRECTIONS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.3.0' : + 'biocontainers/gawk:5.3.0' }" + + input: + tuple val(meta), path(input) + path(program_file) + + output: + tuple val(meta), path("direction.0.${suffix}"), emit: prime5 + tuple val(meta), path("direction.1.${suffix}"), emit: prime3 + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + def args2 = task.ext.args2 ?: '' // args2 is used to specify a program when no program file has been given + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.collect{ it.getExtension()}.get(0)}" // use the first extension of the input files + + program = program_file ? "-f ${program_file}" : "${args2}" + + input.collect{ + assert it.name != "${prefix}.${suffix}" : "Input and output names are the same, set prefix in module configuration to disambiguate!" 
+ } + + """ + awk \\ + ${args} \\ + ${program} \\ + ${input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.getExtension()}" + + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/gawk_split_directions/meta.yml b/modules/local/gawk_split_directions/meta.yml new file mode 100644 index 00000000..34c50b12 --- /dev/null +++ b/modules/local/gawk_split_directions/meta.yml @@ -0,0 +1,63 @@ +name: "gawk" +description: | + If you are like many computer users, you would frequently like to make changes in various text files + wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. + The job is easy with awk, especially the GNU implementation gawk. +keywords: + - gawk + - awk + - txt + - text + - file parsing +tools: + - "gawk": + description: "GNU awk" + homepage: "https://www.gnu.org/software/gawk/" + documentation: "https://www.gnu.org/software/gawk/manual/" + tool_dev_url: "https://www.gnu.org/prep/ftp.html" + licence: ["GPL v3"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: The input file - Specify the logic that needs to be executed on + this file on the `ext.args2` or in the program file. + If the files have a `.gz` extension, they will be unzipped using `zcat`. + pattern: "*" + - - program_file: + type: file + description: Optional file containing logic for awk to execute. If you don't + wish to use a file, you can use `ext.args2` to specify the logic. 
+ pattern: "*" + - - disable_redirect_output: + type: boolean + description: Disable the redirection of awk output to a given file. This is + useful if you want to use awk's built-in redirect to write files instead + of the shell's redirect. +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.${suffix}": + type: file + description: The output file - if using shell redirection, specify the name of this + file using `ext.prefix` and the extension using `ext.suffix`. Otherwise, ensure + the awk program produces files with the extension in `ext.suffix`. + pattern: "*" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" From 2c8a2715ba7ba458caeb3ed2d3807b6d7ebaad4c Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 26 Aug 2025 14:49:47 +0100 Subject: [PATCH 32/58] Update --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b04b2082..c7ef5be3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Better formatting in some files. - Moved `GAWK_UPPER_SEQUENCE` from the `TELO_FINDER` subworkflow to the first step of the main `curationpretext` workflow, this simply makes more sense. - Removed no longer needed scripts from bin. +- Added the module `GAWK_SPLIT_DIRECTIONS` module, a local copy of the nf-core `GAWK` module. - Added the `gawk_split_directions.awk` script for split telomere. - Addition of GUNZIP for the input reference genome. - Update tests. 
@@ -45,6 +46,7 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i | `GAWK_MAP_TELO` | 5.3.0 | REMOVED | | `GET_LARGEST_SCAFF` | coreutils=9.1 | REMOVED | | `GUNZIP` | NA | 1.13 | +| `GAWK_SPLIT_DIRECTIONS | NA | 5.3.0 | ## [[1.4.2](https://github.com/sanger-tol/curationpretext/releases/tag/1.4.2)] - UNSC Nereid (H2) - [2025-07-28] From f4b52e7989cc3dff37cef8d923939191bf34ff81 Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Tue, 26 Aug 2025 17:26:53 +0100 Subject: [PATCH 33/58] Update CHANGELOG.md Missed a '`' --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7ef5be3..e7f8b932 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,7 +46,7 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i | `GAWK_MAP_TELO` | 5.3.0 | REMOVED | | `GET_LARGEST_SCAFF` | coreutils=9.1 | REMOVED | | `GUNZIP` | NA | 1.13 | -| `GAWK_SPLIT_DIRECTIONS | NA | 5.3.0 | +| `GAWK_SPLIT_DIRECTIONS` | NA | 5.3.0 | ## [[1.4.2](https://github.com/sanger-tol/curationpretext/releases/tag/1.4.2)] - UNSC Nereid (H2) - [2025-07-28] From ea76b009a1bd99c97b1f5e587f04368f35428d3e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 28 Aug 2025 11:58:33 +0100 Subject: [PATCH 34/58] Patch --- conf/base.config | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 9add5450..0e65e0a4 100644 --- a/conf/base.config +++ b/conf/base.config @@ -21,6 +21,7 @@ process { withName:SAMTOOLS_MERGE { cpus = { 16 } memory = { 50.GB * task.attempt } + time = { 20.h * task.attempt } } withName: '.*:.*:LONGREAD_COVERAGE:(MINIMAP2_ALIGN|MINIMAP2_ALIGN_SPLIT)' { @@ -86,6 +87,11 @@ process { memory = { 1.GB * task.attempt } } + withName: BEDTOOLS_INTERSECT { + memory = { 10.GB * task.attempt } + time = { 20.h * task.attempt } + } + // Process-specific resource requirements // NOTE - Please 
try and reuse the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. @@ -100,7 +106,7 @@ process { withLabel:process_low { cpus = { 2 * task.attempt } memory = { 12.GB * task.attempt } - time = { 4.h * task.attempt } + time = { 20.h * task.attempt } } withLabel:process_medium { cpus = { 6 * task.attempt } From bb0308d39e8f43fb5d5d98ab952fabe06cb2fe48 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 28 Aug 2025 11:59:18 +0100 Subject: [PATCH 35/58] Patch --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 0e65e0a4..e250d8dc 100644 --- a/conf/base.config +++ b/conf/base.config @@ -106,7 +106,7 @@ process { withLabel:process_low { cpus = { 2 * task.attempt } memory = { 12.GB * task.attempt } - time = { 20.h * task.attempt } + time = { 4.h * task.attempt } } withLabel:process_medium { cpus = { 6 * task.attempt } From a80391efbe45f95305b78a44185dcfdf3e334af8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 28 Aug 2025 16:47:54 +0100 Subject: [PATCH 36/58] Template update for nf-core/tools version 3.3.2 --- .editorconfig | 37 ----- .github/CONTRIBUTING.md | 2 +- .github/actions/get-shards/action.yml | 69 +++++++++ .github/actions/nf-test/action.yml | 109 ++++++++++++++ .github/workflows/ci.yml | 88 ----------- .github/workflows/clean-up.yml | 2 +- .github/workflows/download_pipeline.yml | 20 +-- .../{fix-linting.yml => fix_linting.yml} | 4 +- .github/workflows/linting.yml | 17 +-- .github/workflows/linting_comment.yml | 4 +- .github/workflows/nf-test.yml | 140 ++++++++++++++++++ ...mment.yml => template-version-comment.yml} | 2 +- .nf-core.yml | 4 +- .pre-commit-config.yaml | 26 +++- .prettierrc.yml | 5 + CHANGELOG.md | 2 +- README.md | 7 +- assets/schema_input.json | 4 +- conf/base.config | 6 +- nextflow.config | 22 ++- nf-test.config | 24 +++ .../main.nf | 1 - .../tests/nextflow.config | 2 +- tests/.nftignore | 
2 + tests/default.nf.test | 35 +++++ tests/nextflow.config | 14 ++ 26 files changed, 472 insertions(+), 176 deletions(-) delete mode 100644 .editorconfig create mode 100644 .github/actions/get-shards/action.yml create mode 100644 .github/actions/nf-test/action.yml delete mode 100644 .github/workflows/ci.yml rename .github/workflows/{fix-linting.yml => fix_linting.yml} (96%) create mode 100644 .github/workflows/nf-test.yml rename .github/workflows/{template_version_comment.yml => template-version-comment.yml} (95%) create mode 100644 nf-test.config create mode 100644 tests/.nftignore create mode 100644 tests/default.nf.test create mode 100644 tests/nextflow.config diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 6d9b74cc..00000000 --- a/.editorconfig +++ /dev/null @@ -1,37 +0,0 @@ -root = true - -[*] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -indent_style = space - -[*.{md,yml,yaml,html,css,scss,js}] -indent_size = 2 - -# These files are edited and tested upstream in nf-core/modules -[/modules/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset -[/subworkflows/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset - -[/assets/email*] -indent_size = unset - -# ignore python and markdown -[*.{py,md}] -indent_style = unset - -# ignore ro-crate metadata files -[**/ro-crate-metadata.json] -insert_final_newline = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index d7a0a69f..e3f6b58c 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -71,7 +71,7 @@ If you wish to contribute a new step, please use the following coding standards: 5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core pipelines schema build` tool). 6. 
Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. -8. If applicable, add a new test command in `.github/workflow/ci.yml`. +8. If applicable, add a new test in the `tests` directory. ### Default values diff --git a/.github/actions/get-shards/action.yml b/.github/actions/get-shards/action.yml new file mode 100644 index 00000000..34085279 --- /dev/null +++ b/.github/actions/get-shards/action.yml @@ -0,0 +1,69 @@ +name: "Get number of shards" +description: "Get the number of nf-test shards for the current CI job" +inputs: + max_shards: + description: "Maximum number of shards allowed" + required: true + paths: + description: "Component paths to test" + required: false + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +outputs: + shard: + description: "Array of shard numbers" + value: ${{ steps.shards.outputs.shard }} + total_shards: + description: "Total number of shards" + value: ${{ steps.shards.outputs.total_shards }} +runs: + using: "composite" + steps: + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} + - name: Get number of shards + id: shards + shell: bash + run: | + # Run nf-test with dynamic parameter + nftest_output=$(nf-test test \ + --profile +docker \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --dry-run \ + --ci \ + --changed-since HEAD^) || { + echo "nf-test command failed with exit code $?" + echo "Full output: $nftest_output" + exit 1 + } + echo "nf-test dry-run output: $nftest_output" + + # Default values for shard and total_shards + shard="[]" + total_shards=0 + + # Check if there are related tests + if echo "$nftest_output" | grep -q 'No tests to execute'; then + echo "No related tests found." 
+ else + # Extract the number of related tests + number_of_shards=$(echo "$nftest_output" | sed -n 's|.*Executed \([0-9]*\) tests.*|\1|p') + if [[ -n "$number_of_shards" && "$number_of_shards" -gt 0 ]]; then + shards_to_run=$(( $number_of_shards < ${{ inputs.max_shards }} ? $number_of_shards : ${{ inputs.max_shards }} )) + shard=$(seq 1 "$shards_to_run" | jq -R . | jq -c -s .) + total_shards="$shards_to_run" + else + echo "Unexpected output format. Falling back to default values." + fi + fi + + # Write to GitHub Actions outputs + echo "shard=$shard" >> $GITHUB_OUTPUT + echo "total_shards=$total_shards" >> $GITHUB_OUTPUT + + # Debugging output + echo "Final shard array: $shard" + echo "Total number of shards: $total_shards" diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml new file mode 100644 index 00000000..bf44d961 --- /dev/null +++ b/.github/actions/nf-test/action.yml @@ -0,0 +1,109 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile to use" + required: true + shard: + description: "Shard number for this CI job" + required: true + total_shards: + description: "Total number of test shards(NOT the total number of matrix jobs)" + required: true + paths: + description: "Test paths" + required: true + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +runs: + using: "composite" + steps: + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 + with: + python-version: "3.13" + + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + install-pdiff: true + + - name: Setup apptainer + if: contains(inputs.profile, 'singularity') + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: contains(inputs.profile, 
'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile=+${{ inputs.profile }} \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --ci \ + --changed-since HEAD^ \ + --verbose \ + --tap=test.tap \ + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} + + # Save the absolute path of the test.tap file to the output + echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT + + - name: Generate test summary + if: always() + shell: bash + run: | + # Add header if it doesn't exist (using a token file to track this) + if [ ! -f ".summary_header" ]; then + echo "# 🚀 nf-test results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Status | Test Name | Profile | Shard |" >> $GITHUB_STEP_SUMMARY + echo "|:------:|-----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + touch .summary_header + fi + + if [ -f test.tap ]; then + while IFS= read -r line; do + if [[ $line =~ ^ok ]]; then + test_name="${line#ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ✅ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + elif [[ $line =~ ^not\ ok ]]; then + test_name="${line#not ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ❌ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + done < test.tap + else + echo "| ⚠️ | No test results found | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards 
}} |" >> $GITHUB_STEP_SUMMARY + fi + + - name: Clean up + if: always() + shell: bash + run: | + sudo rm -rf /home/ubuntu/tests/ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 44033727..00000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: nf-core CI -# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - push: - branches: - - dev - pull_request: - release: - types: [published] - workflow_dispatch: - -env: - NXF_ANSI_LOG: false - NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity - NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity - -concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" - cancel-in-progress: true - -jobs: - test: - name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})" - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/curationpretext') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "24.04.2" - - "latest-everything" - profile: - - "conda" - - "docker" - - "singularity" - test_name: - - "test" - isMaster: - - ${{ github.base_ref == 'master' }} - # Exclude conda and singularity on dev - exclude: - - isMaster: false - profile: "conda" - - isMaster: false - profile: "singularity" - steps: - - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - with: - fetch-depth: 0 - - - name: Set up Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - name: Set up Apptainer - if: matrix.profile == 'singularity' - uses: eWaterCycle/setup-apptainer@main - - - name: Set up Singularity - if: matrix.profile == 'singularity' - run: | - mkdir -p $NXF_SINGULARITY_CACHEDIR - mkdir 
-p $NXF_SINGULARITY_LIBRARYDIR - - - name: Set up Miniconda - if: matrix.profile == 'conda' - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3 - with: - miniconda-version: "latest" - auto-update-conda: true - conda-solver: libmamba - channels: conda-forge,bioconda - - - name: Set up Conda - if: matrix.profile == 'conda' - run: | - echo $(realpath $CONDA)/condabin >> $GITHUB_PATH - echo $(realpath python) >> $GITHUB_PATH - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" - continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} - run: | - nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},${{ matrix.profile }} --outdir ./results diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 0b6b1f27..ac030fd5 100644 --- a/.github/workflows/clean-up.yml +++ b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9 + - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." 
diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index ab06316e..999bcc38 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -12,14 +12,6 @@ on: required: true default: "dev" pull_request: - types: - - opened - - edited - - synchronize - branches: - - main - - master - pull_request_target: branches: - main - master @@ -52,9 +44,9 @@ jobs: - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" architecture: "x64" - name: Setup Apptainer @@ -120,6 +112,7 @@ jobs: echo "IMAGE_COUNT_AFTER=$image_count" >> "$GITHUB_OUTPUT" - name: Compare container image counts + id: count_comparison run: | if [ "${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}" -ne "${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}" ]; then initial_count=${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }} @@ -132,3 +125,10 @@ jobs: else echo "The pipeline can be downloaded successfully!" 
fi + + - name: Upload Nextflow logfile for debugging purposes + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + with: + name: nextflow_logfile.txt + path: .nextflow.log* + include-hidden-files: true diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix_linting.yml similarity index 96% rename from .github/workflows/fix-linting.yml rename to .github/workflows/fix_linting.yml index 94c929ba..1c97b461 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix_linting.yml @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" - name: Install pre-commit run: pip install pre-commit diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index dbd52d5a..8b0f88c3 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -3,9 +3,6 @@ name: nf-core linting # It runs the `nf-core pipelines lint` and markdown lint tests to ensure # that the code meets the nf-core guidelines. 
on: - push: - branches: - - dev pull_request: release: types: [published] @@ -16,10 +13,10 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - - name: Set up Python 3.12 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - name: Set up Python 3.13 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" - name: Install pre-commit run: pip install pre-commit @@ -36,13 +33,13 @@ jobs: - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5 with: - python-version: "3.12" + python-version: "3.13" architecture: "x64" - name: read .nf-core.yml - uses: pietrobolcato/action-read-yaml@1.1.0 + uses: pietrobolcato/action-read-yaml@9f13718d61111b69f30ab4ac683e67a56d254e1d # 1.1.0 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml @@ -74,7 +71,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 95b6b6af..d43797d9 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # v8 + uses: dawidd6/action-download-artifact@ac66b43f0e6a346234dd65d4d0c8fbb31cb316e5 # v11 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: 
marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + uses: marocchino/sticky-pull-request-comment@52423e01640425a022ef5fd42c6fb5f633a02728 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml new file mode 100644 index 00000000..593c9360 --- /dev/null +++ b/.github/workflows/nf-test.yml @@ -0,0 +1,140 @@ +name: Run nf-test +on: + pull_request: + paths-ignore: + - "docs/**" + - "**/meta.yml" + - "**/*.md" + - "**/*.png" + - "**/*.svg" + release: + types: [published] + workflow_dispatch: + +# Cancel if a newer run is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NFT_VER: "0.9.2" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test-changes: + name: nf-test-changes + runs-on: # use GitHub runners + - "ubuntu-latest" + outputs: + shard: ${{ steps.set-shards.outputs.shard }} + total_shards: ${{ steps.set-shards.outputs.total_shards }} + steps: + - name: Clean Workspace # Purge the workspace in case it's running on a self-hosted runner + run: | + ls -la ./ + rm -rf ./* || true + rm -rf ./.??* || true + ls -la ./ + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + fetch-depth: 0 + + - name: get number of shards + id: set-shards + uses: ./.github/actions/get-shards + env: + NFT_VER: ${{ env.NFT_VER }} + with: + max_shards: 7 + + - name: debug + run: | + echo ${{ steps.set-shards.outputs.shard }} + echo ${{ steps.set-shards.outputs.total_shards }} + + nf-test: + name: "${{ matrix.profile }} | ${{ matrix.NXF_VER }} | ${{ matrix.shard }}/${{ needs.nf-test-changes.outputs.total_shards }}" + needs: [nf-test-changes] 
+ if: ${{ needs.nf-test-changes.outputs.total_shards != '0' }} + runs-on: # use GitHub runners + - "ubuntu-latest" + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} + profile: [conda, docker, singularity] + isMain: + - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} + # Exclude conda and singularity on dev + exclude: + - isMain: false + profile: "conda" + - isMain: false + profile: "singularity" + NXF_VER: + - "24.10.5" + - "latest-everything" + env: + NXF_ANSI_LOG: false + TOTAL_SHARDS: ${{ needs.nf-test-changes.outputs.total_shards }} + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + with: + fetch-depth: 0 + + - name: Run nf-test + id: run_nf_test + uses: ./.github/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + with: + profile: ${{ matrix.profile }} + shard: ${{ matrix.shard }} + total_shards: ${{ env.TOTAL_SHARDS }} + + - name: Report test status + if: ${{ always() }} + run: | + if [[ "${{ steps.run_nf_test.outcome }}" == "failure" ]]; then + echo "::error::Test with ${{ matrix.NXF_VER }} failed" + # Add to workflow summary + echo "## ❌ Test failed: ${{ matrix.profile }} | ${{ matrix.NXF_VER }} | Shard ${{ matrix.shard }}/${{ env.TOTAL_SHARDS }}" >> $GITHUB_STEP_SUMMARY + if [[ "${{ matrix.NXF_VER }}" == "latest-everything" ]]; then + echo "::warning::Test with latest-everything failed but will not cause workflow failure. Please check if the error is expected or if it needs fixing." 
+ fi + if [[ "${{ matrix.NXF_VER }}" != "latest-everything" ]]; then + exit 1 + fi + fi + + confirm-pass: + needs: [nf-test] + if: always() + runs-on: # use GitHub runners + - "ubuntu-latest" + steps: + - name: One or more tests failed (excluding latest-everything) + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: One or more tests cancelled + if: ${{ contains(needs.*.result, 'cancelled') }} + run: exit 1 + + - name: All tests ok + if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - name: debug-print + if: always() + run: | + echo "::group::DEBUG: `needs` Contents" + echo "DEBUG: toJSON(needs) = ${{ toJSON(needs) }}" + echo "DEBUG: toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" + echo "::endgroup::" diff --git a/.github/workflows/template_version_comment.yml b/.github/workflows/template-version-comment.yml similarity index 95% rename from .github/workflows/template_version_comment.yml rename to .github/workflows/template-version-comment.yml index 537529bc..beb5c77f 100644 --- a/.github/workflows/template_version_comment.yml +++ b/.github/workflows/template-version-comment.yml @@ -14,7 +14,7 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} - name: Read template version from .nf-core.yml - uses: nichmor/minimal-read-yaml@v0.0.2 + uses: nichmor/minimal-read-yaml@1f7205277e25e156e1f63815781db80a6d490b8f # v0.0.2 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml diff --git a/.nf-core.yml b/.nf-core.yml index 259a3724..485f5487 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -30,7 +30,7 @@ lint: nextflow_config: - manifest.name - manifest.homePage -nf_core_version: 3.2.1 +nf_core_version: 3.3.2 repository_type: pipeline template: author: Damon-Lee B Pointon (@DLBPointon) @@ -48,4 +48,4 @@ template: - seqera_platform - multiqc - rocrate - version: 1.4.1 + version: 1.4.2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1dec8650..bb41beec 100644 --- a/.pre-commit-config.yaml +++ 
b/.pre-commit-config.yaml @@ -4,10 +4,24 @@ repos: hooks: - id: prettier additional_dependencies: - - prettier@3.2.5 - - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "3.1.2" + - prettier@3.6.2 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 hooks: - - id: editorconfig-checker - alias: ec + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ + - id: end-of-file-fixer + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ diff --git a/.prettierrc.yml b/.prettierrc.yml index c81f9a76..07dbd8bb 100644 --- a/.prettierrc.yml +++ b/.prettierrc.yml @@ -1 +1,6 @@ printWidth: 120 +tabWidth: 4 +overrides: + - files: "*.{md,yml,yaml,html,css,scss,js,cff}" + options: + tabWidth: 2 diff --git a/CHANGELOG.md b/CHANGELOG.md index bc3e350d..a38609fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.4.1 - [date] +## v1.4.2 - [date] Initial release of sanger-tol/curationpretext, created with the [nf-core](https://nf-co.re/) template. 
diff --git a/README.md b/README.md index 86c881cb..3b3843c7 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ # sanger-tol/curationpretext -[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/ci.yml) +[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml) [![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) +[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -21,7 +22,7 @@ --> + workflows use the "tube map" design for that. 
See https://nf-co.re/docs/guidelines/graphic_design/workflow_diagrams#examples for examples. --> ## Usage diff --git a/assets/schema_input.json b/assets/schema_input.json index 7dea6e86..c21d7515 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -17,14 +17,14 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, "fastq_2": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" } }, diff --git a/conf/base.config b/conf/base.config index 45a8a625..8370150f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,7 +15,7 @@ process { memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 175) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' @@ -59,4 +59,8 @@ process { errorStrategy = 'retry' maxRetries = 2 } + withLabel: process_gpu { + ext.use_gpu = { workflow.profile.contains('gpu') } + accelerator = { workflow.profile.contains('gpu') ? 
1 : null } + } } diff --git a/nextflow.config b/nextflow.config index db46f10c..396dc039 100644 --- a/nextflow.config +++ b/nextflow.config @@ -148,16 +148,25 @@ profiles { ] } } + gpu { + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--nv' + singularity.runOptions = '--nv' + } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } } -// Load nf-core custom profiles from different Institutions -includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" +// Load nf-core custom profiles from different institutions + +// If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. +// Load sanger-tol/curationpretext custom profiles from different institutions. +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" + // Load sanger-tol/curationpretext custom profiles from different institutions. // TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs -// includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/curationpretext.config" : "/dev/null" +// includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/pipeline/curationpretext.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled @@ -213,7 +222,6 @@ dag { manifest { name = 'sanger-tol/curationpretext' - author = """Damon-Lee B Pointon (@DLBPointon)""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead contributors = [ // TODO nf-core: Update the field with the details of the contributors to your pipeline. New with Nextflow version 24.10.0 [ @@ -229,14 +237,14 @@ manifest { description = """A simple pipeline to generate pretext files for genomic curation.""" mainScript = 'main.nf' defaultBranch = 'master' - nextflowVersion = '!>=24.04.2' - version = '1.4.1' + nextflowVersion = '!>=24.10.5' + version = '1.4.2' doi = '' } // Nextflow plugins plugins { - id 'nf-schema@2.2.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-schema@2.4.2' // Validation of pipeline parameters and creation of an input channel from a sample sheet } validation { diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 00000000..3a1fff59 --- /dev/null +++ b/nf-test.config @@ -0,0 +1,24 @@ +config { + // location for all nf-test tests + testsDir "." 
+ + // nf-test directory including temporary files for each test + workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" + + // location of an optional nextflow.config file specific for executing tests + configFile "tests/nextflow.config" + + // ignore tests coming from the nf-core/modules repo + ignore 'modules/nf-core/**/tests/*', 'subworkflows/nf-core/**/tests/*' + + // run all test with defined profile(s) from the main nextflow.config + profile "test" + + // list of filenames or patterns that should be trigger a full test run + triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' + + // load the necessary plugins + plugins { + load "nft-utils@0.0.3" + } +} diff --git a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf index aa3f2b7d..40153083 100644 --- a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf @@ -219,4 +219,3 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } - diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config index 0907ac58..09ef842a 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -1,5 +1,5 @@ plugins { - id "nf-schema@2.1.0" + id "nf-schema@2.4.2" } validation { diff --git a/tests/.nftignore b/tests/.nftignore new file mode 100644 index 00000000..73eb92f7 --- /dev/null +++ b/tests/.nftignore @@ -0,0 +1,2 @@ +.DS_Store +pipeline_info/*.{html,json,txt,yml} diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 00000000..5c9f4cf2 --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,35 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" 
+ + test("-profile test") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/curationpretext_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 00000000..e3be3550 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,14 @@ +/* +======================================================================================== + Nextflow config file for running nf-test tests +======================================================================================== +*/ + +// TODO nf-core: Specify any additional parameters here +// Or any resources requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/curationpretext' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners From 84d9bb96df56c9df8042e70c7b59c1ad02a554a8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Thu, 28 Aug 2025 17:04:48 +0100 Subject: [PATCH 37/58] Template Update 332 --- tests/default.nf.test.snap | 171 
+++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) create mode 100644 tests/default.nf.test.snap diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 00000000..7815abe3 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,171 @@ +{ + "Full run": { + "content": [ + 40, + { + "BEDTOOLS_BAMTOBED": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_GENOMECOV": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_INTERSECT": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_MAKEWINDOWS": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_MAP": { + "bedtools": "2.31.1" + }, + "BWAMEM2_INDEX": { + "bwamem2": "2.2.1" + }, + "CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT": { + "samtools": 1.17, + "bwa-mem2": "2.2.1", + "staden_io_lib": "1.14.14" + }, + "CRAM_GENERATE_CSV": { + "samtools": 1.17 + }, + "EXTRACT_REPEAT": { + "perl": "(v5.26.2))", + "extract_repeat.pl": 1.0 + }, + "EXTRACT_TELOMERE": { + "extract_telomere": 2.0, + "coreutils": 9.1 + }, + "FIND_TELOMERE_REGIONS": { + "find_telomere": 1.0 + }, + "FIND_TELOMERE_WINDOWS": { + "telomere": 1.0 + }, + "GAWK_CLEAN_TELOMERE": { + "gawk": "5.3.0" + }, + "GAWK_GAP_LENGTH": { + "gawk": "5.3.0" + }, + "GAWK_GENERATE_GENOME_FILE": { + "gawk": "5.3.0" + }, + "GAWK_REFORMAT_INTERSECT": { + "gawk": "5.3.0" + }, + "GAWK_RENAME_IDS": { + "gawk": "5.3.0" + }, + "GAWK_REPLACE_DOTS": { + "gawk": "5.3.0" + }, + "GAWK_UPPER_SEQUENCE": { + "gawk": "5.3.0" + }, + "GET_LARGEST_SCAFFOLD": { + "get_largest_scaffold": 2.0, + "coreutils": 9.1 + }, + "GNU_SORT": { + "coreutils": 9.3 + }, + "GNU_SORT_A": { + "coreutils": 9.3 + }, + "GNU_SORT_B": { + "coreutils": 9.3 + }, + "GNU_SORT_C": { + "coreutils": 9.3 + }, + "GRAPH_OVERALL_COVERAGE": { + "perl": "(v5.26.2))", + "graph_overall_coverage.pl": 1.0 + }, + "MINIMAP2_ALIGN": { + "minimap2": "2.28-r1209", + "samtools": 1.2 + }, + "PRETEXTMAP_STANDRD": { + "pretextmap": "0.1.9", + "samtools": 1.21 + }, + "PRETEXT_INGEST_SNDRD": { + "PretextGraph": "0.0.9", + "PretextMap": 
"0.1.9", + "bigWigToBedGraph": 447 + }, + "SAMTOOLS_FAIDX": { + "samtools": 1.21 + }, + "SAMTOOLS_MERGE": { + "samtools": 1.21 + }, + "SAMTOOLS_SORT": { + "samtools": 1.21 + }, + "SAMTOOLS_VIEW_FILTER_PRIMARY": { + "samtools": 1.21 + }, + "SEQTK_CUTN": { + "seqtk": "1.4-r122" + }, + "SNAPSHOT_SRES": { + "pretextsnapshot": "0.0.4" + }, + "UCSC_BEDGRAPHTOBIGWIG": { + "ucsc": 469 + }, + "WINDOWMASKER_MKCOUNTS": { + "windowmasker": "1.0.0" + }, + "WINDOWMASKER_USTAT": { + "windowmasker": "1.0.0" + }, + "Workflow": { + "sanger-tol/curationpretext": "v1.4.2" + } + }, + [ + "accessory_files", + "accessory_files/CurationPretextTest.bigWig", + "accessory_files/CurationPretextTest.gap.bedgraph", + "accessory_files/CurationPretextTest_telomere.bed", + "accessory_files/CurationPretextTest_telomere.bedgraph", + "accessory_files/coverage.bigWig", + "pipeline_info", + "pipeline_info/sanger-tol_curationpretext_software_versions.yml", + "pretext_maps_processed", + "pretext_maps_processed/CurationPretextTest_normal.pretext", + "pretext_maps_raw", + "pretext_maps_raw/CurationPretextTest_normal_pi.pretext", + "pretext_snapshot", + "pretext_snapshot/CurationPretextTest_normalFullMap.png" + ], + 14, + [ + "CurationPretextTest.bigWig:md5,3f66a9152d793a62f877b733c2336dfd", + "CurationPretextTest.gap.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_telomere.bed:md5,d41d8cd98f00b204e9800998ecf8427e", + "CurationPretextTest_telomere.bedgraph:md5,d41d8cd98f00b204e9800998ecf8427e", + "coverage.bigWig:md5,2e474506c957152b231ac63c859f0b17" + ], + 5, + 1, + false, + true, + 1, + false, + true, + 1, + true + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.04.4" + }, + "timestamp": "2025-04-16T11:23:34.556355" + } +} From 0832a1a24c5ca0e55c2c853b23e5f5600b1d7a68 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 29 Aug 2025 09:36:39 +0100 Subject: [PATCH 38/58] updated ro-crate --- ro-crate-metadata.json | 27 ++++++++++++++++++++------- 1 file changed, 20 
insertions(+), 7 deletions(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 50ea4f24..61ee727e 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "Stable", "datePublished": "2025-05-27T09:34:43+00:00", - "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/workflows/nf-core%20CI/badge.svg)](https://github.com/sanger-tol/curationpretext/actions?query=workflow%3A%22nf-core+CI%22)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/workflows/nf-core%20linting/badge.svg)](https://github.com/sanger-tol/curationpretext/actions?query=workflow%3A%22nf-core+linting%22)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, 
coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://pipelines.tol.sanger.ac.uk/curationpretext/1.4.0/usage) on how to set-up Nextflow. Make sure to [test your setup](https://pipelines.tol.sanger.ac.uk/curationpretext/1.4.0/usage) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--reads`\n\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n\n- `--read_type`\n\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta 
} \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, 
please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template 
version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--reads`\n\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n\n- `--read_type`\n\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta } \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT 
license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -124,7 +124,11 @@ }, { "@id": "main.nf", - "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], + "@type": [ + "File", + "SoftwareSourceCode", + "ComputationalWorkflow" + ], "creator": [ { "@id": "https://orcid.org/0000-0002-7860-3560" @@ -133,9 +137,16 @@ "dateCreated": "", "dateModified": "2025-05-27T10:34:43Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", - "keywords": ["nf-core", "nextflow"], - "license": ["MIT"], - "name": ["sanger-tol/curationpretext"], + "keywords": [ + "nf-core", + "nextflow" + ], + "license": [ + "MIT" + ], + "name": [ + "sanger-tol/curationpretext" + ], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, @@ -146,7 +157,9 @@ "https://github.com/sanger-tol/curationpretext", "https://pipelines.tol.sanger.ac.uk/sanger-tol/curationpretext/1.4.2/" ], - "version": ["1.4.2"] + "version": [ + "1.4.2" + ] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -332,4 +345,4 @@ "name": "Josie Paris" } ] -} +} \ No newline at end of file From 7d8c1390428bb8edf6b24ecc3f5c06fd03a78b1c Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 29 Aug 2025 09:41:36 +0100 Subject: [PATCH 39/58] Pre-Commit Prettier --- CHANGELOG.md | 2 -- README.md | 8 -------- 2 files changed, 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7f8b932..fe9a0345 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -126,11 +126,9 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i 
### Added and Fixed - GRIT found a bug in `pretext_graph` ingestion code where null values were being introduced as the track name - - This has now need hardcoded, there was no need for dynamic naming anyway - GRIT found a bug in `pretext_graph` ingestion where gap and telomere tracks stopped being ingested correctly and would no longer display or be zeroed out. - - I'm not entirely sure of the cause of this but i think it is a mix of how pretext handles unnamed tracks, assuming their datatype so a null named gap track would be treated as a repeat track, and incorrect logic in the pretext_graph module. - Added GAWK module (as GAWK_CLEAN_TELOMERE) to remove "you screwed up" (this is a legacy error message which will be changed to something more informative and professional) error lines which can appear with some telo motifs or lower case motifs. These will otherwise cause the FIND_TELOMERE_WINDOWS process to crash. diff --git a/README.md b/README.md index 74b503de..d68799fe 100644 --- a/README.md +++ b/README.md @@ -31,35 +31,27 @@ This is intended as a supplementary pipeline for the [treeval](https://github.co Currently, the pipeline uses the following flags: - `--input` - - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa` - `--reads` - - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/` - `--read_type` - - The type of longread data you are utilising, e.g., ont, illumina, hifi. - `--aligner` - - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported. - `--cram` - - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/` - `--map_order` - - hic map scaffold order, input either `length` or `unsorted` - `--teloseq` - - A telomeric sequence, e.g., `TTAGGG` - `--all_output` - - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured. 
Now, you can run the pipeline using: From cebb3ca789cbe7447acb01f7617313b6194810fb Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 29 Aug 2025 09:43:05 +0100 Subject: [PATCH 40/58] Pre-Commit Prettier --- ro-crate-metadata.json | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 61ee727e..86ee1f8c 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -124,11 +124,7 @@ }, { "@id": "main.nf", - "@type": [ - "File", - "SoftwareSourceCode", - "ComputationalWorkflow" - ], + "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], "creator": [ { "@id": "https://orcid.org/0000-0002-7860-3560" @@ -137,16 +133,9 @@ "dateCreated": "", "dateModified": "2025-05-27T10:34:43Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", - "keywords": [ - "nf-core", - "nextflow" - ], - "license": [ - "MIT" - ], - "name": [ - "sanger-tol/curationpretext" - ], + "keywords": ["nf-core", "nextflow"], + "license": ["MIT"], + "name": ["sanger-tol/curationpretext"], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, @@ -157,9 +146,7 @@ "https://github.com/sanger-tol/curationpretext", "https://pipelines.tol.sanger.ac.uk/sanger-tol/curationpretext/1.4.2/" ], - "version": [ - "1.4.2" - ] + "version": ["1.4.2"] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -345,4 +332,4 @@ "name": "Josie Paris" } ] -} \ No newline at end of file +} From 78075a28b0d6b4c5ec113ed154529abb62b6c979 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 29 Aug 2025 11:09:49 +0100 Subject: [PATCH 41/58] Another rocrate update --- ro-crate-metadata.json | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 86ee1f8c..05ff8efd 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ 
"@type": "Dataset", "creativeWorkStatus": "Stable", "datePublished": "2025-05-27T09:34:43+00:00", - "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## 
Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--reads`\n\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n\n- `--read_type`\n\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta } \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT 
license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with 
singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--reads`\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n\n- `--read_type`\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta } \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT 
license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -124,7 +124,11 @@ }, { "@id": "main.nf", - "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], + "@type": [ + "File", + "SoftwareSourceCode", + "ComputationalWorkflow" + ], "creator": [ { "@id": "https://orcid.org/0000-0002-7860-3560" @@ -133,9 +137,16 @@ "dateCreated": "", "dateModified": "2025-05-27T10:34:43Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", - "keywords": ["nf-core", "nextflow"], - "license": ["MIT"], - "name": ["sanger-tol/curationpretext"], + "keywords": [ + "nf-core", + "nextflow" + ], + "license": [ + "MIT" + ], + "name": [ + "sanger-tol/curationpretext" + ], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, @@ -146,7 +157,9 @@ "https://github.com/sanger-tol/curationpretext", "https://pipelines.tol.sanger.ac.uk/sanger-tol/curationpretext/1.4.2/" ], - "version": ["1.4.2"] + "version": [ + "1.4.2" + ] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -332,4 +345,4 @@ "name": "Josie Paris" } ] -} +} \ No newline at end of file From 521e2d609667b1e6cf3abf10d3556040683606e6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 29 Aug 2025 11:10:48 +0100 Subject: [PATCH 42/58] Another rocrate update --- ro-crate-metadata.json | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 05ff8efd..fd7ddd4a 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -124,11 +124,7 @@ }, { "@id": 
"main.nf", - "@type": [ - "File", - "SoftwareSourceCode", - "ComputationalWorkflow" - ], + "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], "creator": [ { "@id": "https://orcid.org/0000-0002-7860-3560" @@ -137,16 +133,9 @@ "dateCreated": "", "dateModified": "2025-05-27T10:34:43Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", - "keywords": [ - "nf-core", - "nextflow" - ], - "license": [ - "MIT" - ], - "name": [ - "sanger-tol/curationpretext" - ], + "keywords": ["nf-core", "nextflow"], + "license": ["MIT"], + "name": ["sanger-tol/curationpretext"], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, @@ -157,9 +146,7 @@ "https://github.com/sanger-tol/curationpretext", "https://pipelines.tol.sanger.ac.uk/sanger-tol/curationpretext/1.4.2/" ], - "version": [ - "1.4.2" - ] + "version": ["1.4.2"] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -345,4 +332,4 @@ "name": "Josie Paris" } ] -} \ No newline at end of file +} From 26162d766d8a880481b69d4a39a8d94f103955d8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 15 Sep 2025 10:44:11 +0100 Subject: [PATCH 43/58] Update to resources --- conf/base.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index 84aed456..16ee6a42 100644 --- a/conf/base.config +++ b/conf/base.config @@ -31,12 +31,12 @@ process { withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 80 : Math.ceil( ( reference.size() / 1e+9 ) * 30 ) * Math.ceil( task.attempt * 1 ) ) } + memory = { 1.GB * ( reference.size() < 2e9 ? 80 : Math.ceil( ( reference.size() / 1e+9 ) * 50 ) * Math.ceil( task.attempt * 1 ) ) } } withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT { cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 
50 : Math.ceil( ( reference.size() / 1e+9 ) * 3 ) * Math.ceil( task.attempt * 1 ) ) } + memory = { 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 8 ) * Math.ceil( task.attempt * 1 ) ) } } withName: PRETEXT_GRAPH { From e196eaa2bf9026accd5003d39a54828e6d573a43 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 15 Sep 2025 10:44:37 +0100 Subject: [PATCH 44/58] Update naming of tracks to match what pretextview expects --- modules/local/pretext/graph/main.nf | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/modules/local/pretext/graph/main.nf b/modules/local/pretext/graph/main.nf index 49bec139..df351d1f 100644 --- a/modules/local/pretext/graph/main.nf +++ b/modules/local/pretext/graph/main.nf @@ -30,6 +30,20 @@ process PRETEXT_GRAPH { def UCSC_VERSION = '447' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. // Using single [ ] as nextflow will use sh where possible not bash + // + // Core Args must match the below (taken from PretextView), this allows + // the use of keyboard shortcuts for main tracks: + // + // data_type_dic{ // use this data_type + // {"default", 0, }, + // {"repeat_density", 1}, + // {"gap", 2}, + // {"coverage", 3}, + // {"coverage_avg", 4}, + // {"telomere", 5}, + // {"not_weighted", 6} + // }; + // """ echo "PROCESSING ESSENTIAL FILES" @@ -87,7 +101,9 @@ process PRETEXT_GRAPH { if [ -s "\$file_og" ]; then echo "Processing OG_TELOMERE file: \$file_og" - PretextGraph $args -i "\$input_file" -n "og_telomere" -o telo_0.pretext < "\$file_og" + + # Must be named "telomere" + PretextGraph $args -i "\$input_file" -n "telomere" -o telo_0.pretext < "\$file_og" else echo "OG TELOMERE file - Could be empty or missing" cp "\$input_file" telo_0.pretext From e102e0f4250ba504bad3d2ec4e659cb1b8d622af Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Tue, 16 Sep 2025 10:37:32 
+0100 Subject: [PATCH 45/58] Update nextflow.config Correction from @muffato Co-authored-by: Matthieu Muffato --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index c8f7f654..cd1f3d60 100644 --- a/nextflow.config +++ b/nextflow.config @@ -265,7 +265,7 @@ manifest { homePage = 'https://github.com/sanger-tol/curationpretext' description = """A simple pipeline to generate pretext files for genomic curation.""" mainScript = 'main.nf' - defaultBranch = 'master' + defaultBranch = 'main' nextflowVersion = '!>=24.10.5' version = '1.5.0' doi = '10.5281/zenodo.12773958' From bd428b344bbfdaf715f344509a81feb2a0eb08bd Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 16 Sep 2025 12:28:34 +0100 Subject: [PATCH 46/58] Add switch for large genomes to switch over to MINIMAP2 rather than end-user selection --- .../utils_nfcore_curationpretext_pipeline/main.nf | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf index c91cb7d3..8a3b4670 100644 --- a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf @@ -79,17 +79,24 @@ workflow PIPELINE_INITIALISATION { type: 'dir' ) - ch_reference = input_fasta.map { fasta -> + ch_reference = input_fasta.map { fasta -> + def fasta_size = fasta.size() + def selected_aligner = (params_aligner == "AUTO") ? + (fasta_size > 5e9 ? 
"minimap2" : "bwamem2") : + params.aligner + tuple( - [ id: params.sample, - aligner: params.aligner, + [ + id: params.sample, + aligner: selected_aligner, map_order: params.map_order, - ref_size: fasta.size(), + ref_size: fasta_size, ], fasta ) } + ch_cram_reads = cram_dir.map { dir -> tuple( [ id: params.sample ], From 512ac147e39939c661078d4165dbadc6a1e4539a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 16 Sep 2025 12:28:52 +0100 Subject: [PATCH 47/58] Adding AUTO as option --- nextflow.config | 2 +- nextflow_schema.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index cd1f3d60..b86dba7b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,7 @@ params { teloseq = "TTAGGG" reads = null cram = null - aligner = "bwamem2" + aligner = "AUTO" read_type = "hifi" map_order = "unsorted" all_output = false diff --git a/nextflow_schema.json b/nextflow_schema.json index b3d71f02..06b76aca 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -69,9 +69,9 @@ "aligner": { "type": "string", "description": "Aligner for use {minimap2, bwamem2} in generating map", - "help_text": "Pick between {minimap2, bwamem2}. Defaults to 'minimap2'", + "help_text": "Pick between {minimap2, bwamem2, AUTO}. 
Defaults to 'AUTO' (minimap2 for genomes >5Gb, otherwise bwamem2)", "fa_icon": "fas fa-file-signature", - "enum": ["bwamem2", "minimap2"] + "enum": ["bwamem2", "minimap2", "AUTO"] }, "run_hires": { "type": "boolean", From d7e6c97c932f973cd82ffcfd1cd7efe061d57c56 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 16 Sep 2025 12:29:28 +0100 Subject: [PATCH 48/58] TreeVal Parity for the resource configs --- conf/base.config | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/conf/base.config b/conf/base.config index 16ee6a42..9ca32fe1 100644 --- a/conf/base.config +++ b/conf/base.config @@ -21,22 +21,41 @@ process { withName:SAMTOOLS_MERGE { cpus = { 16 } memory = { 50.GB * task.attempt } - time = { 20.h * task.attempt } + time = { 30.h * task.attempt } } withName: '.*:.*:LONGREAD_COVERAGE:(MINIMAP2_ALIGN|MINIMAP2_ALIGN_SPLIT)' { - cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 )) } + cpus = { 20 * 1 } + memory = { + 1.GB * ( + reference.size() < 2e9 ? 30 : + (reference.size() < 5e9 ? 40 : + (reference.size() < 10e9 ? 60 : + Math.ceil((reference.size() / 1e9) * 3) + ) + ) + ) * Math.ceil(task.attempt * 1) + } + time = { 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 48) } } withName: CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT { - cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 80 : Math.ceil( ( reference.size() / 1e+9 ) * 50 ) * Math.ceil( task.attempt * 1 ) ) } + cpus = { 16 * 1 } + memory = { 1.GB * ( reference.size() < 2e9 ? 80 : Math.ceil( ( reference.size() / 1e+9 ) * 30 ) * Math.ceil( task.attempt * 1 ) ) } } withName: CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT { - cpus = { 16 } - memory = { 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 8 ) * Math.ceil( task.attempt * 1 ) ) } + cpus = { 16 * 1 } + memory = { + 1.GB * ( + reference.size() < 2e9 ? 30 : + (reference.size() < 5e9 ? 
40 : + (reference.size() < 10e9 ? 60 : + Math.ceil((reference.size() / 1e9) * 3) + ) + ) + ) * Math.ceil(task.attempt * 1) + } } withName: PRETEXT_GRAPH { @@ -44,9 +63,9 @@ process { } withName: PRETEXTMAP_STANDRD{ - cpus = { 8 * task.attempt } + cpus = { 8 * 1 } memory = { 3.GB * task.attempt } - time = { 1.h * ( ( fasta.size() < 4e9 ? 24 : 48 ) * Math.ceil( task.attempt * 1 ) ) } + time = { 1.h * ( ( fasta.size() < 4e9 ? 24 : 48 ) * task.attempt ) } } withName: PRETEXTMAP_HIGHRES { From 08462959ec31fd396a9e1b5657d85a09f72b3d2e Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 16 Sep 2025 12:36:50 +0100 Subject: [PATCH 49/58] Fat finger spelling mistake --- .../local/utils_nfcore_curationpretext_pipeline/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf index 8a3b4670..f0882ce6 100644 --- a/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_curationpretext_pipeline/main.nf @@ -81,7 +81,7 @@ workflow PIPELINE_INITIALISATION { ch_reference = input_fasta.map { fasta -> def fasta_size = fasta.size() - def selected_aligner = (params_aligner == "AUTO") ? + def selected_aligner = (params.aligner == "AUTO") ? (fasta_size > 5e9 ? 
"minimap2" : "bwamem2") : params.aligner From f799ad8a533f2aaedd79d9e633aded8d3a16a3ec Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 16 Sep 2025 12:51:10 +0100 Subject: [PATCH 50/58] Pipeline doesn't support conda --- .github/workflows/nf-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml index 593c9360..f43d5b74 100644 --- a/.github/workflows/nf-test.yml +++ b/.github/workflows/nf-test.yml @@ -71,8 +71,8 @@ jobs: - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} # Exclude conda and singularity on dev exclude: - - isMain: false - profile: "conda" + # - isMain: false + # profile: "conda" - isMain: false profile: "singularity" NXF_VER: From 48f7b5ce7d0530e3b0c19b0eaad51d4d15057045 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 16 Sep 2025 13:40:23 +0100 Subject: [PATCH 51/58] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe9a0345..901bff0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added the `gawk_split_directions.awk` script for split telomere. - Addition of GUNZIP for the input reference genome. - Update tests. +- Added an "AUTO" value to the `--aligner` arg. If a genome is >5Gb it will use minimap2 else bwamem2. +- Parity update for the base.config to match TreeVal. 
### Paramters From f39eecfb6ac189bee8c3bb4d618df2b24792ce23 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 16 Sep 2025 13:41:21 +0100 Subject: [PATCH 52/58] remove conda this time --- .github/workflows/nf-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml index f43d5b74..e113a611 100644 --- a/.github/workflows/nf-test.yml +++ b/.github/workflows/nf-test.yml @@ -66,7 +66,7 @@ jobs: fail-fast: false matrix: shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} - profile: [conda, docker, singularity] + profile: [docker, singularity] isMain: - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} # Exclude conda and singularity on dev From dfa8f0b01388fe3840de57afaaf635b26bfd147d Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 11:45:02 +0100 Subject: [PATCH 53/58] Update some text --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index d68799fe..8553501e 100644 --- a/README.md +++ b/README.md @@ -33,8 +33,12 @@ Currently, the pipeline uses the following flags: - `--input` - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa` +- `--sample` + - Sample is the naming prefix of the output files, e.g. iyTipFemo + - `--reads` - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/` + - This folder *must* contain files in a `.fasta.gz` format, or they will be skipped by the internal file search function. - `--read_type` - The type of longread data you are utilising, e.g., ont, illumina, hifi. 
From 39d021ff1d6d9e7037c8894a3f2deddbaccc59b6 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 11:47:20 +0100 Subject: [PATCH 54/58] add line on sample --- docs/usage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 842b4cdd..c723c594 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,6 +10,8 @@ Currently, the pipeline expects input data to be in a specific format. The `--input` should be `.fasta` or `.fa` (the same format but differing suffix). +The `--sample` value is the naming prefix used for the output files. + The `--cram` should point to the folder containing `.cram` files along with a `.crai` per `.cram`. The `--reads` should point to the folder containing `.fasta.gz` files. From be1c6920893e130ba69f6156f000988eb272569b Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 11:53:06 +0100 Subject: [PATCH 55/58] linting --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8553501e..52b99db1 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Currently, the pipeline uses the following flags: - `--reads` - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/` - - This folder *must* contain files in a `.fasta.gz` format, or they will be skipped by the internal file search function. + - This folder _must_ contain files in a `.fasta.gz` format, or they will be skipped by the internal file search function. - `--read_type` - The type of longread data you are utilising, e.g., ont, illumina, hifi. 
From 8aa53411de1450a324a7f8f470a2194fc5a1d1b8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 12:14:35 +0100 Subject: [PATCH 56/58] Update --- ro-crate-metadata.json | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index fd7ddd4a..b39134f9 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "Stable", "datePublished": "2025-05-27T09:34:43+00:00", - "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with 
docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--reads`\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n\n- `--read_type`\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta } \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT 
license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "# ![sanger-tol/curationpretext](docs/images/curationpretext-light.png#gh-light-mode-only) ![sanger-tol/curationpretext](docs/images/curationpretext-dark.png#gh-dark-mode-only)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/curationpretext/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.12773958-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.12773958)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with 
singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/curationpretext)\n\n## Introduction\n\n**sanger-tol/curationpretext** is a bioinformatics pipeline typically used in conjunction with [TreeVal](https://github.com/sanger-tol/treeval) to generate pretext maps (and optionally telomeric, gap, coverage, and repeat density plots which can be ingested into pretext) for the manual curation of high quality genomes.\n\nThis is intended as a supplementary pipeline for the [treeval](https://github.com/sanger-tol/treeval) project. This pipeline can be simply used to generate pretext maps, information on how to run this pipeline can be found in the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage).\n\n![Workflow Diagram](./docs/images/CurationPretext_1_3_0.png)\n\n1. Generate Maps - Generates pretext maps as well as a static image.\n\n2. Accessory files - Generates the repeat density, gap, telomere, and coverage tracks.\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nCurrently, the pipeline uses the following flags:\n\n- `--input`\n - The absolute path to the assembled genome in, e.g., `/path/to/assembly.fa`\n\n- `--sample`\n - Sample is the naming prefix of the output files, e.g. 
iyTipFemo\n\n- `--reads`\n - The directory of the fasta files generated from longread reads, e.g., `/path/to/fasta/`\n - This folder _must_ contain files in a `.fasta.gz` format, or they will be skipped by the internal file search function.\n\n- `--read_type`\n - The type of longread data you are utilising, e.g., ont, illumina, hifi.\n\n- `--aligner`\n - The aligner yopu wish to use for the coverage generation, defaults to bwamem2 but minimap2 is also supported.\n\n- `--cram`\n - The directory of the cram _and_ cram.crai files, e.g., `/path/to/cram/`\n\n- `--map_order`\n - hic map scaffold order, input either `length` or `unsorted`\n\n- `--teloseq`\n - A telomeric sequence, e.g., `TTAGGG`\n\n- `--all_output`\n - An option to output all maps + accessory files, the default will only output the pretextmaps where ingestion has occured.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/curationpretext \\\n --input { input.fasta } \\\n --cram { path/to/cram/ } \\\n --reads { path/to/longread/fasta/ } \\\n --read_type { default is \"hifi\" }\n --sample { default is \"pretext_rerun\" } \\\n --teloseq { default is \"TTAGGG\" } \\\n --map_order { default is \"unsorted\" } \\\n --all_output \\\n --outdir { OUTDIR } \\\n -profile \n\n```\n\n> **Warning:**\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those\n> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_;\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/parameters).\n\n## Pipeline output\n\nTo see the the results of a test run with a full size dataset refer to the [results](https://pipelines.tol.sanger.ac.uk/curationpretext/results) tab on the sanger-tol/curationpretext website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://pipelines.tol.sanger.ac.uk/curationpretext/output).\n\n## Credits\n\nsanger-tol/curationpretext was originally written by Damon-Lee B Pointon (@DLBPointon).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- @muffato - For reviews.\n\n- @yumisims - TreeVal and Software.\n\n- @weaglesBio - TreeVal and Software.\n\n- @josieparis - Help with better docs and testing.\n\n- @mahesh-panchal - Large support with 1.2.0 in making the pipeline more robust with other HPC environments.\n\n- @GRIT - For feedback and feature requests.\n\n- @prototaxites - Support with 1.3.0 and showing me the power of GAWK.\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/curationpretext for your analysis, please cite it using the following doi: [10.5281/zenodo.12773958](https://doi.org/10.5281/zenodo.12773958)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT 
license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -124,7 +124,11 @@ }, { "@id": "main.nf", - "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], + "@type": [ + "File", + "SoftwareSourceCode", + "ComputationalWorkflow" + ], "creator": [ { "@id": "https://orcid.org/0000-0002-7860-3560" @@ -133,9 +137,16 @@ "dateCreated": "", "dateModified": "2025-05-27T10:34:43Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", - "keywords": ["nf-core", "nextflow"], - "license": ["MIT"], - "name": ["sanger-tol/curationpretext"], + "keywords": [ + "nf-core", + "nextflow" + ], + "license": [ + "MIT" + ], + "name": [ + "sanger-tol/curationpretext" + ], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, @@ -146,7 +157,9 @@ "https://github.com/sanger-tol/curationpretext", "https://pipelines.tol.sanger.ac.uk/sanger-tol/curationpretext/1.4.2/" ], - "version": ["1.4.2"] + "version": [ + "1.4.2" + ] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -332,4 +345,4 @@ "name": "Josie Paris" } ] -} +} \ No newline at end of file From a74b9e12fee0c3d92abafd8e07999bea68bf1ccf Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 13:26:27 +0100 Subject: [PATCH 57/58] Pre-commit linting --- ro-crate-metadata.json | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index b39134f9..c8de9e84 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -124,11 +124,7 @@ }, { "@id": 
"main.nf", - "@type": [ - "File", - "SoftwareSourceCode", - "ComputationalWorkflow" - ], + "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], "creator": [ { "@id": "https://orcid.org/0000-0002-7860-3560" @@ -137,16 +133,9 @@ "dateCreated": "", "dateModified": "2025-05-27T10:34:43Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", - "keywords": [ - "nf-core", - "nextflow" - ], - "license": [ - "MIT" - ], - "name": [ - "sanger-tol/curationpretext" - ], + "keywords": ["nf-core", "nextflow"], + "license": ["MIT"], + "name": ["sanger-tol/curationpretext"], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, @@ -157,9 +146,7 @@ "https://github.com/sanger-tol/curationpretext", "https://pipelines.tol.sanger.ac.uk/sanger-tol/curationpretext/1.4.2/" ], - "version": [ - "1.4.2" - ] + "version": ["1.4.2"] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -345,4 +332,4 @@ "name": "Josie Paris" } ] -} \ No newline at end of file +} From b921914fa9757083ef9a2f7712689611a7db6a0a Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 13:50:09 +0100 Subject: [PATCH 58/58] Updates --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 901bff0c..77dbc52b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added and Fixed -- Template update to 3.3.3. . +- Template update to 3.3.2. - Addition of the `--split_telomere` boolean flag, this is false by default. - When `true` the pipeline will split the telomere file into a 5 and 3 prime file. - Update `ACCESSORY_FILES` subworkflow: @@ -28,6 +28,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update tests. - Added an "AUTO" value to the `--aligner` arg. If a genome is >5Gb it will use minimap2 else bwamem2. 
- Parity update for the base.config to match TreeVal. +- Minor Doc updates. +- Comment out the CONDA workflow requirement; the pipeline does not support conda. ### Paramters