forked from sanger-tol/treeval
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.nf
More file actions
executable file
·118 lines (104 loc) · 3.8 KB
/
main.nf
File metadata and controls
executable file
·118 lines (104 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env nextflow
// This subworkflow runs BUSCO on the input reference and converts the BUSCO genes reported in
// full_table.tsv into a sorted BigBed file; for lepidoptera lineages it also runs ANCESTRAL_GENE.
// Input - Assembled genomic fasta file (reference_tuple), its .genome file, the BUSCO lineage name/database path, a .as file and an ancestral table
// Output - A BigBed file of BUSCO genes and, for lepidoptera lineages only, an ancestral gene BigBed.
//
// MODULE IMPORT BLOCK
//
include { BUSCO_BUSCO } from '../../../modules/nf-core/busco/busco/main'
include { UCSC_BEDTOBIGBED } from '../../../modules/nf-core/ucsc/bedtobigbed/main'
include { BEDTOOLS_SORT } from '../../../modules/nf-core/bedtools/sort/main'
include { GAWK as GAWK_EXTRACT_BUSCOGENE } from '../../../modules/nf-core/gawk/main'
//
// SUBWORKFLOW IMPORT BLOCK
//
include { ANCESTRAL_GENE } from '../ancestral_gene/main'
//
// WORKFLOW: BUSCO_ANNOTATION
// Runs BUSCO on the reference, extracts the BUSCO genes from full_table.tsv with an awk
// script, sorts the result, converts it to BigBed and, when the lineage name starts with
// "lepidoptera", hands the BUSCO table to the ANCESTRAL_GENE subworkflow.
//
workflow BUSCO_ANNOTATION {
    take:
    dot_genome          // Channel: tuple [val(meta), [ datafile ]]
    reference_tuple     // Channel: tuple [val(meta), [ datafile ]]
    lineageinfo         // Channel: val(lineage_db)
    lineagespath        // Channel: val(/path/to/buscoDB)
    buscogene_as        // Channel: val(dot_as location)
    ancestral_table     // Channel: val(ancestral_table location)

    main:
    ch_versions         = Channel.empty()

    //
    // MODULE: RUN BUSCO TO OBTAIN FULL_TABLE.TSV
    //         EMITS THE BUSCO OUTPUT DIRECTORY THAT CONTAINS FULL_TABLE.TSV
    //
    BUSCO_BUSCO (
        reference_tuple,
        "genome",           // NOTE(review): presumably the BUSCO run mode - confirm against the module
        lineageinfo,
        lineagespath,
        []                  // NOTE(review): presumably an optional config file left unset - confirm against the module
    )
    ch_versions         = ch_versions.mix(BUSCO_BUSCO.out.versions.first())

    //
    // LOGIC: LOCATE FULL_TABLE.TSV TWO DIRECTORY LEVELS BELOW THE BUSCO OUTPUT DIRECTORY
    //          files() RETURNS A LIST AND checkIfExists FAILS THE RUN IF NOTHING MATCHES
    //
    ch_busco_full_table = BUSCO_BUSCO.out.busco_dir.map { meta, dir ->
        tuple(meta, files(dir.resolve("*/*/full_table.tsv"), checkIfExists: true))
    }

    //
    // MODULE: EXTRACT THE BUSCO GENES FOUND IN REFERENCE
    //
    GAWK_EXTRACT_BUSCOGENE (
        ch_busco_full_table,
        file("${projectDir}/bin/get_busco_gene.awk"),
        false
    )
    ch_versions         = ch_versions.mix( GAWK_EXTRACT_BUSCOGENE.out.versions )

    //
    // LOGIC: ADDING LINE COUNT TO THE FILE FOR BETTER RESOURCE USAGE
    //          THE META MAP IS REBUILT WITH ONLY `id` AND `lines`; OTHER KEYS ARE NOT CARRIED OVER
    //          THE CLOSURE PARAMETER IS NOT CALLED `file` SO IT DOES NOT SHADOW THE BUILT-IN file()
    //
    GAWK_EXTRACT_BUSCOGENE.out.output
        .map { meta, gene_file ->
            tuple(
                [
                    id: meta.id,
                    lines: gene_file.countLines()
                ],
                gene_file
            )
        }
        .set { bedtools_input }

    //
    // MODULE: SORT THE EXTRACTED BUSCO GENE
    //
    BEDTOOLS_SORT(
        bedtools_input,
        []
    )
    ch_versions         = ch_versions.mix( BEDTOOLS_SORT.out.versions )

    //
    // MODULE: CONVERT THE BED TO BIGBED
    //
    UCSC_BEDTOBIGBED(
        BEDTOOLS_SORT.out.sorted,
        dot_genome.map { _meta, genome_file -> genome_file }, // Gets file from tuple (meta, file)
        buscogene_as
    )
    ch_versions         = ch_versions.mix( UCSC_BEDTOBIGBED.out.versions )

    //
    // SUBWORKFLOW: RUN ANCESTRAL BUSCO ID (ONLY AVAILABLE FOR LEPIDOPTERA)
    // LOGIC: AGGREGATE DATA AND FILTER ON CLASS
    //          ONLY LINEAGES WHOSE NAME BEFORE THE FIRST '_' IS "lepidoptera" PASS THE FILTER,
    //          SO FOR ANY OTHER LINEAGE THE CHANNELS FED TO ANCESTRAL_GENE BELOW ARE EMPTY
    //
    lineageinfo
        .combine(ch_busco_full_table)
        .combine(ancestral_table)
        .filter { lineage, _meta, _btable, _atable ->
            lineage.split('_')[0] == "lepidoptera"
        }
        .multiMap { _lineage, meta, busco_full_table, ancestral_table_ ->
            busco_table: tuple( meta, busco_full_table )
            atable: ancestral_table_
        }
        .set { ch_busco_lep_data }

    ANCESTRAL_GENE (
        ch_busco_lep_data.busco_table,
        dot_genome,
        buscogene_as,
        ch_busco_lep_data.atable
    )
    ch_versions         = ch_versions.mix( ANCESTRAL_GENE.out.versions )

    emit:
    ch_buscogene_bigbed = UCSC_BEDTOBIGBED.out.bigbed
    ch_ancestral_bigbed = ANCESTRAL_GENE.out.ch_ancestral_bigbed
    versions            = ch_versions
}