-
Notifications
You must be signed in to change notification settings - Fork 2
Update to use bioawk #216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Update to use bioawk #216
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| // | ||
| // MODULE IMPORT BLOCK | ||
| // | ||
| include { BIOAWK } from '../../../modules/nf-core/bioawk/main' | ||
| include { TELOMERE_REGIONS } from '../../../modules/sanger-tol/telomere/regions/main' | ||
| include { GAWK as GAWK_SPLIT_TELOMERE } from '../../../modules/nf-core/gawk/main' | ||
| include { TELOMERE_WINDOWS } from '../../../modules/sanger-tol/telomere/windows/main' | ||
|
|
@@ -12,20 +13,42 @@ workflow TELO_FINDER { | |
|
|
||
| take: | ||
| ch_reference // Channel [ val(meta), path(fasta) ] | ||
| ch_telomereseq // Channel.of( telomere sequence ) | ||
| ch_telomereseq // Channel [ val(meta), path(fasta) ] | ||
| val_split_telomere // bool | ||
| val_run_bgzip // bool | ||
|
|
||
| main: | ||
|
|
||
| //if G > 30% then flip else pass | ||
| // | ||
| // MODULE: BIOAWK CONVERT THE MOTIF INTO THE 5 PRIME DIRECTION | ||
| // IF PROVIDED IN THE 3 PRIME DIRECTION | ||
| // IF MOTIF HAS A G CONTENT OF > 30% IT IS IN THE 3 PRIME | ||
| // | ||
| BIOAWK( | ||
| ch_telomereseq, | ||
| "tsv" | ||
| ) | ||
|
|
||
|
|
||
| // | ||
| // LOGIC: READ THE FIRST LINE OF THE OUTPUT FILE | ||
| // | ||
| corrected_telomere = BIOAWK.out.output | ||
| .map { _meta, file -> | ||
| def lines = file.toFile().readLines() | ||
| // Lines from bioawk are: | ||
| // corrected_sequence G_count G_percentage reversed? original_sequence | ||
| lines[0].split('\t')[0] | ||
| } | ||
| .filter { it != null } | ||
|
|
||
|
Comment on lines
+36
to
+44
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this keep
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in all fairness there should never be a null. If there's a null you've forgotten to include a telomere and will have other issues. |
||
|
|
||
| // | ||
| // MODULE: FINDS THE TELOMERIC SEQUENCE IN REFERENCE | ||
| // | ||
| TELOMERE_REGIONS ( | ||
| ch_reference, | ||
| ch_telomereseq | ||
| corrected_telomere | ||
| ) | ||
DLBPointon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| ch_full_telomere = TELOMERE_REGIONS.out.telomere | ||
|
|
@@ -89,7 +112,7 @@ workflow TELO_FINDER { | |
| // THIS ONLY HAPPENS ON WHOLE TELOMERIC FILES | ||
| // | ||
| TELOMERE_WINDOWS ( | ||
| ch_regions_for_extraction.filter { meta, file -> meta.direction == 0 } | ||
| ch_regions_for_extraction.filter { meta, _file -> meta.direction == 0 } | ||
| ) | ||
|
|
||
| // | ||
|
|
@@ -140,8 +163,9 @@ workflow TELO_FINDER { | |
| ) | ||
|
|
||
| emit: | ||
| bed_file = ch_telo_bedfiles // Channel [meta, bed] | ||
| bed_gz_tbi = TABIX_BGZIPTABIX.out.gz_index // Not used anymore | ||
| bedgraph_file = ch_telo_bedgraphs // Channel [meta, [bedfiles]] - Used in pretext_graph | ||
| telomere_summary = BIOAWK.out.output // Channel [meta, tsv] | ||
| bed_file = ch_telo_bedfiles // Channel [meta, bed] | ||
| bed_gz_tbi = TABIX_BGZIPTABIX.out.gz_index // Channel [meta, index] | ||
| bedgraph_file = ch_telo_bedgraphs // Channel [meta, [bedfiles]] - Used in pretext_graph | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,11 @@ | ||
| nextflow.enable.moduleBinaries = true | ||
|
|
||
| process { | ||
|
|
||
| withName: BIOAWK { | ||
| // bioawk program run per FASTA record (-c fastx): counts G in the upper-cased sequence, | ||
| // reverse-complements the record when G content is > 30%, and prints the tab-separated columns | ||
| // corrected_sequence, G_count, G_percentage, reversed?, original_sequence. | ||
| // Every `$seq` is escaped (`\$seq`) so Groovy passes it through to bioawk instead of interpolating it. | ||
| ext.args = { "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'" } | ||
| } | ||
|
Comment on lines
+5
to
+7
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One last question, very much optional, and which could have been prompted when you were doing the BIOAWK module before 😅 - would it be worth making the bioawk module more like the GAWK module and be able to take a program file? Then you could write this as a value channel in the subworkflow script?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did think about that, it would definitely clean it up. But chose the path of least resistance. I don't know if it can take a file as input to be honest, I'll mock up a test and get back to you. Edit: actually right in the help line
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I checked the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah looks good: I'll open up the modules repo again
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rip it apart @prototaxites ! |
||
|
|
||
| withName: TELOMERE_WINDOWS { | ||
| tag = { "${meta.id}_${meta.direction}P" } | ||
| ext.args = "99.9" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,10 +12,12 @@ nextflow_workflow { | |
| tag "telomere/windows" | ||
| tag "telomere/extract" | ||
| tag "tabix/bgziptabix" | ||
| tag "subworkflows/../../modules/nf-core/bioawk" | ||
| tag "subworkflows/../../modules/nf-core/gawk" | ||
| tag "subworkflows/../../modules/nf-core/gunzip" | ||
| tag "subworkflows/../../modules/nf-core/tabix/bgziptabix" | ||
| tag "modules/nf-core/gunzip" | ||
| tag "modules/nf-core/bioawk" | ||
|
|
||
| setup { | ||
| nfcoreInitialise("${launchDir}/library/") | ||
|
|
@@ -24,7 +26,8 @@ nextflow_workflow { | |
| [ | ||
| "gawk", | ||
| "tabix/bgziptabix", | ||
| "gunzip" | ||
| "gunzip", | ||
| "bioawk" | ||
| ] | ||
| ) | ||
| nfcoreLink("${launchDir}/library/", "${baseDir}/modules/") | ||
|
|
@@ -46,14 +49,18 @@ nextflow_workflow { | |
| test("idFanCani4 - no split - fasta w/ index") { | ||
| when { | ||
| params { | ||
| bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The benefit is that json could be reused in other workflows, but thinking about it, this should already be sufficient. |
||
| windows_percent = "99.9" | ||
| bgzip_args = "--csi" | ||
| } | ||
|
|
||
| workflow { | ||
| """ | ||
| input[0] = GUNZIP.out.gunzip | ||
| input[1] = "TTAGG" | ||
| input[1] = [ | ||
| [ id: "motifs" ], | ||
| file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true) | ||
| ] | ||
| input[2] = false | ||
| input[3] = true | ||
| """ | ||
|
|
@@ -75,14 +82,18 @@ nextflow_workflow { | |
| test("idFanCani4 - split - fasta w/ index") { | ||
| when { | ||
| params { | ||
| bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'" | ||
| windows_percent = "99.9" | ||
| bgzip_args = "--csi" | ||
| } | ||
|
|
||
| workflow { | ||
| """ | ||
| input[0] = GUNZIP.out.gunzip | ||
| input[1] = "TTAGG" | ||
| input[1] = [ | ||
| [ id: "motifs" ], | ||
| file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true) | ||
| ] | ||
| input[2] = true | ||
| input[3] = true | ||
| """ | ||
|
|
@@ -104,14 +115,18 @@ nextflow_workflow { | |
| test("idFanCani4 - no split - fasta w/o index") { | ||
| when { | ||
| params { | ||
| bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'" | ||
| windows_percent = "99.9" | ||
| bgzip_args = "--csi" | ||
| } | ||
|
|
||
| workflow { | ||
| """ | ||
| input[0] = GUNZIP.out.gunzip | ||
| input[1] = "TTAGG" | ||
| input[1] = [ | ||
| [ id: "motifs" ], | ||
| file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true) | ||
| ] | ||
| input[2] = false | ||
| input[3] = false | ||
| """ | ||
|
|
@@ -133,14 +148,18 @@ nextflow_workflow { | |
| test("idFanCani4 - split - fasta w/o index") { | ||
| when { | ||
| params { | ||
| bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'" | ||
| windows_percent = "99.9" | ||
| bgzip_args = "--csi" | ||
| } | ||
|
|
||
| workflow { | ||
| """ | ||
| input[0] = GUNZIP.out.gunzip | ||
| input[1] = "TTAGG" | ||
| input[1] = [ | ||
| [ id: "motifs" ], | ||
| file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true) | ||
| ] | ||
| input[2] = true | ||
| input[3] = false | ||
| """ | ||
|
|
@@ -162,14 +181,18 @@ nextflow_workflow { | |
| test("idFanCani4 - no split - fasta - stub w/o index") { | ||
| when { | ||
| params { | ||
| bioawk_command = "-c fastx \'{s = toupper(\$seq); copy_s = s; g = gsub(/G/, \"\", s); pct = 100*g/length(copy_s); rev = (pct > 30); out = rev ? revcomp(\$seq) : \$seq; printf \"%s\\t%d\\t%.2f\\t%s\\t%s\\n\", out, g, pct, (rev ? \"true\" : \"false\"), copy_s }\'" | ||
| windows_percent = "99.9" | ||
| bgzip_args = "--csi" | ||
| } | ||
|
|
||
| workflow { | ||
| """ | ||
| input[0] = GUNZIP.out.gunzip | ||
| input[1] = "TTAGG" | ||
| input[1] = [ | ||
| [ id: "motifs" ], | ||
| file(params.modules_testdata_base_path + 'resources/modules/telomere/generic/telomere_motif.fasta', checkIfExists: true) | ||
| ] | ||
| input[2] = false | ||
| input[3] = false | ||
| """ | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.