-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathnextflow.config
More file actions
285 lines (230 loc) · 12 KB
/
nextflow.config
File metadata and controls
285 lines (230 loc) · 12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
// Nextflow plugins required by this workflow
plugins {
id 'nf-schema@2.5.1' // pinned nf-schema plugin version
}
//***************************************** Global parameters *******************************************//
params {
// Suffix appended to the names of workflow output files
assay_suffix = "_GLAmpSeq"
// Optional prefix for output file names; empty string disables prefixing
output_prefix = ""
// Mandatory parameters
target_region = "16S" // "16S", "18S", "ITS"
raw_R1_suffix = "${params.assay_suffix}_R1_raw.fastq.gz"
raw_R2_suffix = "${params.assay_suffix}_R2_raw.fastq.gz"
trim_primers = "TRUE" // "TRUE" or "FALSE"
// -------- Required only if --accession is false ---------------//
// A 4-column (single-end) or 5-column (paired-end) input csv file with the following headers ( sample_id, forward, [reverse,] paired, groups)
input_file = null
// Cutadapt parameters
min_cutadapt_len = 130 // presumably cutadapt's minimum retained read length — verify in workflow scripts
primers_linked = "TRUE" // "TRUE" or "FALSE"
discard_untrimmed = "TRUE" // "TRUE" or "FALSE"
anchored_primers = "TRUE" // "TRUE" or "FALSE"
F_primer = null // forward primer sequence
R_primer = null // reverse primer sequence
// Dada2 parameters
left_trunc = 0 // forward-read truncation length (0 presumably disables truncation)
right_trunc = 0 // reverse-read truncation length (0 presumably disables truncation)
left_maxEE = 1 // max expected errors, forward reads
right_maxEE = 1 // max expected errors, reverse reads
concatenate_reads_only = "FALSE" // "TRUE" or "FALSE"
// If using conda environments specify their locations so new ones won't be created
// Specify the paths to existing conda environments (/path/to/envs/cutadapt)
// leave as is if you want to create a new conda environment
conda_cutadapt = null // /path/to/envs/cutadapt
conda_diversity = null // /path/to/envs/R_diversity
conda_dp_tools = null // /path/to/envs/dp_tools
conda_fastqc = null // /path/to/envs/fastqc
conda_multiqc = null // /path/to/envs/multiqc
conda_R = null // /path/to/envs/R
conda_zip = null // /path/to/envs/zip
conda_wget = null // /path/to/envs/wget
// Mandatory parameters if using GLDS or OSD accession as input
accession = null
publishDir_mode = "link" // "link", "copy"
// Suffixes
primer_trimmed_R1_suffix = "${params.assay_suffix}_R1_trimmed.fastq.gz"
primer_trimmed_R2_suffix = "${params.assay_suffix}_R2_trimmed.fastq.gz"
filtered_R1_suffix = "${params.assay_suffix}_R1_filtered.fastq.gz"
filtered_R2_suffix = "${params.assay_suffix}_R2_filtered.fastq.gz"
// Directories
outdir = "${launchDir}" // defaults to the directory the workflow was launched from
raw_reads_dir = "${params.outdir}/Raw_Sequence_Data/"
metadata_dir = "${params.outdir}/Metadata/"
genelab_dir = "${params.outdir}/GeneLab/"
fastqc_out_dir = "${params.outdir}/workflow_output/FastQC_Outputs/"
trimmed_reads_dir = "${params.outdir}/workflow_output/Trimmed_Sequence_Data/"
filtered_reads_dir = "${params.outdir}/workflow_output/Filtered_Sequence_Data/"
final_outputs_dir = "${params.outdir}/workflow_output/Final_Outputs/"
// Multiqc
multiqc_config = "${projectDir}/config/multiqc.config"
// -------- Differential abundance parameters ----- //
diff_abund_method = "all" // ["all", "ancombc1", "ancombc2", or "deseq2"] - it runs all three by default
group = "groups" // column in input csv file to be compared
samples_column = "sample_id" // column in input csv file containing sample names
remove_struc_zeros = false // should structural zeros be removed when running ANCOMBC?
// Should rare features and samples be discarded. Values are true or false. If set to true then set the cutoffs below
remove_rare = false
prevalence_cutoff = 0 // a fraction between 0 and 1 representing the minimum prevalence a taxon needs to be retained
library_cutoff = 0 // Samples with library sizes less than this number will be excluded in the analysis
// Minimum desired sample rarefaction depth for diversity analysis
rarefaction_depth = 500
errorStrategy = "terminate" // default error strategy applied in the process scope below
debug = false // set to true if you'd like to see the values of your set parameters
}
// Setting the default container engine as singularity
params.containerEngine = "singularity"
// Conda shouldn't be used by default except when using conda-based profiles
// (the conda and mamba profiles below flip this to true)
params.use_conda = false
/*******************************************************************************************************
*************************************** Workflow Profiles **********************************************
********************************************************************************************************/
profiles {
// Submit tasks through the SLURM scheduler
slurm {
process.executor = 'slurm'
}
// Provision software via conda environments
conda {
conda.enabled = true
params.use_conda = true
conda.channels = 'conda-forge,bioconda'
conda.cacheDir = 'conda/' // location of conda environments
conda.createTimeout = '2h'
}
// Same as the conda profile but resolves environments with mamba
mamba {
conda.enabled = true
conda.useMamba = true
conda.channels = 'conda-forge,bioconda'
params.use_conda = true
conda.cacheDir = 'conda/' // location of conda environments
conda.createTimeout = '2h'
}
// Run containers with Singularity
singularity {
singularity.enabled = true
singularity.autoMounts = true
/* Uncomment the line below if you'd like to set the cache directory here,
as setting it here takes precedence over setting the nextflow variable
NXF_SINGULARITY_CACHEDIR=singularity/ in your run script
*/
//singularity.cacheDir = "singularity/" // location of singularity images
params.containerEngine = "singularity"
}
// Run containers with Docker as the invoking user (so outputs aren't root-owned)
docker {
docker.enabled = true
docker.runOptions = '-u $(id -u):$(id -g)'
params.containerEngine = "docker"
}
}
// Maximum number of jobs the executor will submit/run in parallel
executor.queueSize = 20
/******************************************************************************************************************
***************** Tune process specific resources (cpu, container, memory etc.) ***********************************
*******************************************************************************************************************/
process {
//******************* Default process settings ************************//
// Use the configured strategy; fall back to "ignore" only when params.errorStrategy
// is null/empty (it defaults to "terminate" in the params block above)
errorStrategy = { params.errorStrategy ?: "ignore" }
maxRetries = 2
cpus = 2
memory = "5 GB"
cache = 'lenient'
//debug = true // uncomment to see what is being emitted to the standard output
//************************* Accession runsheet and input file retrieval **************************************//
// For each tool: use a pre-built conda env if its params.conda_* path is set, otherwise
// build one from the bundled yaml (conda profiles only); container is used otherwise
withName: GET_RUNSHEET {
conda = { params.conda_dp_tools ?: "${projectDir}/envs/dp_tools.yaml" }
container = "quay.io/nasa_genelab/dp_tools:1.3.8"
publishDir = [path: params.genelab_dir, mode: params.publishDir_mode]
}
//********************************** Raw read staging ********************************************//
withName: COPY_READS {
maxRetries = 3
errorStrategy = 'retry' // retry transient staging failures instead of stopping the run
publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode]
}
withName: 'COPY_REMOTE_READS|DOWNLOAD_DATABASE' {
conda = { params.conda_wget ?: "${projectDir}/envs/wget.yaml" }
container = "quay.io/nasa_genelab/wget:1.21.4"
maxRetries = 3
maxForks = 5 // cap concurrent downloads
errorStrategy = 'retry'
publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode]
}
//********************************** Read quality control and assessment ********************************************//
withLabel: fastqc {
conda = { params.conda_fastqc ?: "${projectDir}/envs/fastqc.yaml" }
container = "quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0"
}
withLabel: zip {
conda = { params.conda_zip ?: "${projectDir}/envs/zip.yaml" }
container = "quay.io/nasa_genelab/zip:3.0"
}
withName: RAW_FASTQC {
publishDir = [path: params.raw_reads_dir, mode: params.publishDir_mode]
}
withName: "RAW_MULTIQC|FILTERED_MULTIQC" {
conda = { params.conda_multiqc ?: "${projectDir}/envs/multiqc.yaml" }
container = "quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0"
}
withName: "ZIP_MULTIQC_RAW|ZIP_MULTIQC_FILTERED" {
publishDir = [path: params.fastqc_out_dir, mode: params.publishDir_mode]
}
withName: "CUTADAPT|COMBINE_CUTADAPT_LOGS_AND_SUMMARIZE" {
conda = { params.conda_cutadapt ?: "${projectDir}/envs/cutadapt.yaml" }
container = "quay.io/biocontainers/cutadapt:5.0--py39hbcbf7aa_0"
memory = "10 GB"
publishDir = [path: params.trimmed_reads_dir, mode: params.publishDir_mode]
}
withName: FILTERED_FASTQC {
publishDir = [path: params.filtered_reads_dir, mode: params.publishDir_mode]
}
//********************************** ASV table creation ********************************************//
withName: "RUN_R_TRIM|RUN_R_NOTRIM" {
conda = { params.conda_R ?: "${projectDir}/envs/R.yaml" }
container = "quay.io/nasa_genelab/r-dada-decipher-biomformat:1.1"
memory = "20 GB"
cpus = 10
// Publish filtered reads and final outputs to separate directories; saveAs strips
// the leading subdirectory so files land directly in the target directory
publishDir = [[path: params.filtered_reads_dir, pattern: "Filtered_Sequence_Data/*",
mode: params.publishDir_mode, saveAs: { fn -> fn.substring(fn.lastIndexOf('/')+1) } ],
[path: params.final_outputs_dir, pattern: "final_outputs/*.{tsv,biom,fasta}",
mode: params.publishDir_mode, saveAs: { fn -> fn.substring(fn.lastIndexOf('/')+1) } ]]
}
withName: ZIP_BIOM {
publishDir = [path: params.final_outputs_dir, mode: params.publishDir_mode]
}
//********************************** Diversity and differential abundance testing ********************************************//
withLabel: visualization {
conda = { params.conda_diversity ?: "${projectDir}/envs/diversity.yaml" }
container = "quay.io/nasa_genelab/r-diversity:1.1"
cpus = 5
memory = "10 GB"
publishDir = [path: params.final_outputs_dir, mode: params.publishDir_mode]
}
withName: SOFTWARE_VERSIONS {
publishDir = [path: params.metadata_dir, mode: params.publishDir_mode]
}
}
/*****************************************************************************
********************** Workflow Resource Usage Capturing *********************
******************************************************************************/
// Adapted from : https://github.com/nf-core/rnaseq/blob/master/nextflow.config
// Run timestamp so each run's resource-usage reports get unique file names
def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
// HTML timeline of task execution
timeline {
enabled = true
file = "${params.outdir}/Resource_Usage/execution_timeline_${trace_timestamp}.html"
}
// HTML execution report (resource usage summary)
report {
enabled = true
file = "${params.outdir}/Resource_Usage/execution_report_${trace_timestamp}.html"
}
// Tab-separated per-task execution trace
trace {
enabled = true
file = "${params.outdir}/Resource_Usage/execution_trace_${trace_timestamp}.txt"
}
/******************************************************************************
**************************** Workflow Metadata ********************************
*******************************************************************************/
manifest {
author = 'Olabiyi Aderemi Obayomi, Mike D. Lee'
homePage = 'https://github.com/nasa/GeneLab_Data_Processing/blob/master/Amplicon/'
description = 'Amplicon Illumina workflow for pipeline document GL-DPPD-7104-C'
mainScript = 'main.nf' // workflow entry point
defaultBranch = 'main'
nextflowVersion = '>=24.04.4' // minimum Nextflow version required to run
version = '1.0.8'
}