sanger-tol · DLBPointon · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026 · prototaxites
diff --git a/subworkflows/sanger-tol/telo_finder/main.nf b/subworkflows/sanger-tol/telo_finder/main.nf
@@ -1,6 +1,7 @@
 //
 // MODULE IMPORT BLOCK
 //
+include { BIOAWK                        } from '../../../modules/nf-core/bioawk/main'
 include { TELOMERE_REGIONS              } from '../../../modules/sanger-tol/telomere/regions/main'
 include { GAWK as GAWK_SPLIT_TELOMERE   } from '../../../modules/nf-core/gawk/main'
 include { TELOMERE_WINDOWS              } from '../../../modules/sanger-tol/telomere/windows/main'
@@ -12,20 +13,42 @@ workflow TELO_FINDER {
 
     take:
     ch_reference        // Channel [ val(meta), path(fasta) ]
-    ch_telomereseq      // Channel.of( telomere sequence )
+    ch_telomereseq      // Channel [ val(meta), path(fasta) ]
     val_split_telomere  // bool
     val_run_bgzip       // bool
 
     main:
 
-    //if G > 30% then flip else pass
+    //
+    // MODULE: BIOAWK CONVERT THE MOTIF INTO THE 5 PRIME DIRECTION
+    //         IF PROVIDED IN THE 3 PRIME DIRECTION
+    //         IF MOTIF HAS A G CONTENT OF > 30% IT IS IN THE 3 PRIME
+    //
+    BIOAWK(
+        ch_telomereseq,
+        "tsv"
+    )
+
+
+    //
+    // LOGIC: READ THE CORRECTED MOTIF (COLUMN 1) FROM THE FIRST LINE OF THE OUTPUT FILE
+    //
+    corrected_telomere = BIOAWK.out.output
+        .map { _meta, tsv ->
+            // Only the first record is used; bioawk columns are:
+            // corrected_sequence  G_count  G_percentage  reversed?  original_sequence
+            def lines = tsv.readLines()
+            lines ? lines[0].split('\t')[0] : null  // null when bioawk wrote nothing
+        }
+        .filter { motif -> motif }  // drop missing or empty motifs
+
 
     //
     // MODULE: FINDS THE TELOMERIC SEQEUNCE IN REFERENCE
     //
     TELOMERE_REGIONS (
         ch_reference,
-        ch_telomereseq
+        corrected_telomere
     )
 
     ch_full_telomere = TELOMERE_REGIONS.out.telomere
@@ -89,7 +112,7 @@ workflow TELO_FINDER {
     //         THIS ONLY HAPPENS ON WHOLE TELOMERIC FILES
     //
     TELOMERE_WINDOWS (
-        ch_regions_for_extraction.filter { meta, file -> meta.direction == 0 }
+        ch_regions_for_extraction.filter { meta, _file -> meta.direction == 0 }
     )
 
     //
@@ -140,8 +163,9 @@ workflow TELO_FINDER {
     )
 
     emit:
-    bed_file        = ch_telo_bedfiles          // Channel [meta, bed]
-    bed_gz_tbi      = TABIX_BGZIPTABIX.out.gz_index  // Not used anymore
-    bedgraph_file   = ch_telo_bedgraphs         // Channel [meta, [bedfiles]] - Used in pretext_graph
+    telomere_summary    = BIOAWK.out.output             // Channel [meta, tsv]
+    bed_file            = ch_telo_bedfiles              // Channel [meta, bed]
+    bed_gz_tbi          = TABIX_BGZIPTABIX.out.gz_index // Channel [meta, index]
+    bedgraph_file       = ch_telo_bedgraphs             // Channel [meta, [bedfiles]] - Used in pretext_graph
 
 }
diff --git a/subworkflows/sanger-tol/telo_finder/meta.yml b/subworkflows/sanger-tol/telo_finder/meta.yml
@@ -11,6 +11,8 @@ components:
   - telomere/regions
   - telomere/windows
   - telomere/extract
+  - bioawk:
+      git_remote: https://github.com/nf-core/modules.git
   - gawk:
       git_remote: https://github.com/nf-core/modules.git
   - tabix/bgziptabix:
@@ -23,8 +25,10 @@ input:
         Meta is the Groovy Map containing sample information
         Reference is the fasta for analysis
   - ch_telomereseq:
-      type: string
+      type: file
       description: |
+        Structure: [ val(meta), path(fasta) ], where fasta holds the telomere motif
+        Meta is the Groovy Map containing sample information
         A string containing the DNA sequence of a telomere motif
   - val_split_telomere:
       type: boolean
@@ -35,6 +39,11 @@ input:
       description: |
         Control running of tabix with boolean
 output:
+  - telomere_summary:
+      type: file
+      description: |
+        Structure: [ val(meta), path(tsv) ]
+        A tsv file summarising the motif orientation check (corrected sequence, G count, G percentage, reversed flag, original sequence).
   - bed_file:
       type: file
       description: |

diff --git a/subworkflows/sanger-tol/telo_finder/nextflow.config b/subworkflows/sanger-tol/telo_finder/nextflow.config
@@ -1,6 +1,11 @@
 nextflow.enable.moduleBinaries = true
 
 process {
+
+    withName: BIOAWK { // reverse-complement the motif when its G content is > 30% (given 3 prime -> emit 5 prime)
+        ext.args = { "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'" }
+    }
+
     withName: TELOMERE_WINDOWS {
         tag         = { "${meta.id}_${meta.direction}P" }
         ext.args    = "99.9"

diff --git a/subworkflows/sanger-tol/telo_finder/tests/main.nf.test b/subworkflows/sanger-tol/telo_finder/tests/main.nf.test
@@ -12,10 +12,12 @@ nextflow_workflow {
     tag "telomere/windows"
     tag "telomere/extract"
     tag "tabix/bgziptabix"
+    tag "subworkflows/../../modules/nf-core/bioawk"
     tag "subworkflows/../../modules/nf-core/gawk"
     tag "subworkflows/../../modules/nf-core/gunzip"
     tag "subworkflows/../../modules/nf-core/tabix/bgziptabix"
     tag "modules/nf-core/gunzip"
+    tag "modules/nf-core/bioawk"
 
     setup {
         nfcoreInitialise("${launchDir}/library/")
@@ -24,7 +26,8 @@ nextflow_workflow {
             [
                 "gawk",
                 "tabix/bgziptabix",
-                "gunzip"
+                "gunzip",
+                "bioawk"
             ]
         )
         nfcoreLink("${launchDir}/library/", "${baseDir}/modules/")
@@ -46,14 +49,18 @@ nextflow_workflow {
     test("idFanCani4 - no split - fasta w/ index") {
         when {
             params {
+                bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'"
                 windows_percent = "99.9"
                 bgzip_args       = "--csi"
             }
 
             workflow {
                 """
                 input[0] = GUNZIP.out.gunzip
-                input[1] = "TTAGG"
+                input[1] = [
+                    [ id: "motifs" ],
+                    file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true)
+                ]
                 input[2] = false
                 input[3] = true
                 """
@@ -75,14 +82,18 @@ nextflow_workflow {
     test("idFanCani4 - split - fasta w/ index") {
         when {
             params {
+                bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'"
                 windows_percent = "99.9"
                 bgzip_args       = "--csi"
             }
 
             workflow {
                 """
                 input[0] = GUNZIP.out.gunzip
-                input[1] = "TTAGG"
+                input[1] = [
+                    [ id: "motifs" ],
+                    file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true)
+                ]
                 input[2] = true
                 input[3] = true
                 """
@@ -104,14 +115,18 @@ nextflow_workflow {
     test("idFanCani4 - no split - fasta w/o index") {
         when {
             params {
+                bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'"
                 windows_percent = "99.9"
                 bgzip_args       = "--csi"
             }
 
             workflow {
                 """
                 input[0] = GUNZIP.out.gunzip
-                input[1] = "TTAGG"
+                input[1] = [
+                    [ id: "motifs" ],
+                    file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true)
+                ]
                 input[2] = false
                 input[3] = false
                 """
@@ -133,14 +148,18 @@ nextflow_workflow {
     test("idFanCani4 - split - fasta w/o index") {
         when {
             params {
+                bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'"
                 windows_percent = "99.9"
                 bgzip_args       = "--csi"
             }
 
             workflow {
                 """
                 input[0] = GUNZIP.out.gunzip
-                input[1] = "TTAGG"
+                input[1] = [
+                    [ id: "motifs" ],
+                    file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true)
+                ]
                 input[2] = true
                 input[3] = false
                 """
@@ -162,14 +181,18 @@ nextflow_workflow {
     test("idFanCani4 - no split - fasta - stub w/o index") {
         when {
             params {
+                bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'"
                 windows_percent = "99.9"
                 bgzip_args       = "--csi"
             }
 
             workflow {
                 """
                 input[0] = GUNZIP.out.gunzip
-                input[1] = "TTAGG"
+                input[1] = [
+                    [ id: "motifs" ],
+                    file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true)
+                ]
                 input[2] = false
                 input[3] = false
                 """