Skip to content

Commit 9ed040f

Browse files
committed
Merge branch 'CW-5053' into 'dev'
Add exceptions, improve docs [CW-5053] See merge request epi2melabs/workflows/wf-metagenomics!218
2 parents 72bd623 + c616bc5 commit 9ed040f

13 files changed

+121
-22
lines changed

.gitlab-ci.yml

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ variables:
1313
NF_WORKFLOW_OPTS: "--fastq ${CI_PROJECT_NAME}/data/wf-metagenomics-demo/test_data/ -executor.\\$$local.memory 16GB --database_set Standard-8"
1414
NF_PROCESS_FILES: >
1515
subworkflows/kraken_pipeline.nf
16-
NF_IGNORE_PROCESSES: "rebatchFastq, output_kraken2_read_assignments"
16+
NF_IGNORE_PROCESSES: "output_kraken2_read_assignments"
1717
CI_FLAVOUR: "new"
1818
CWG_AWS_ENV_NAME: "stack"
1919
PYTEST_CONTAINER_NAME: "wf-common"
@@ -23,7 +23,6 @@ variables:
2323
aws-run:
2424
variables:
2525
NF_WORKFLOW_OPTS: "--fastq test_data/case01 --store_dir s3://$${XAWS_BUCKET}/${CI_PROJECT_NAME}/store --database_set Standard-8"
26-
NF_IGNORE_PROCESSES: "rebatchFastq"
2726
artifacts:
2827
when: always
2928
paths:
@@ -36,13 +35,11 @@ aws-run:
3635
singularity-run:
3736
variables:
3837
NF_WORKFLOW_OPTS: "--fastq test_data/case01 --database_set ncbi_16s_18s"
39-
NF_IGNORE_PROCESSES: "rebatchFastq"
4038

4139
## This test avoids using the Standard-8GB. The use of this database is tested in aws-run
4240
macos-run:
4341
variables:
4442
NF_WORKFLOW_OPTS: "--fastq test_data/case01 --database_set ncbi_16s_18s"
45-
NF_IGNORE_PROCESSES: "rebatchFastq"
4643

4744
docker-run:
4845

@@ -64,7 +61,11 @@ docker-run:
6461
"minimap2-exclude-host", "kraken2-exclude-host", "minimap2-exclude-host-bam", "minimap2-exclude-host-empty-barcode",
6562
"kraken2-bam", "minimap2-bam", "minimap2-igv",
6663
"kraken2-real-time", "kraken2-real-time-bam", "amr-real-time",
67-
"minimap2-split-prefix"
64+
"minimap2-split-prefix",
65+
"report-many-samples",
66+
# test combinations of databases
67+
"exception-kraken2-reference", "exception-minimap2-database",
68+
"exception-minimap2-real-time", "exception-minimap2-igv-mmi-reference"
6869
]
6970
rules:
7071
- if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template")
@@ -250,4 +251,50 @@ docker-run:
250251
--database_set "ncbi_16s_18s"
251252
--read_limit 10000
252253
NF_IGNORE_PROCESSES: "concatAssignments,rebatchFastq,download_reference_ref2taxid,prepareSILVA,check_reference_ref2taxid,configure_igv,determine_bracken_length,download_unpack_taxonomy,unpack_download_kraken2_database"
254+
## Test the report with many samples, simulated on the fly
255+
- if: $MATRIX_NAME == "report-many-samples"
256+
variables:
257+
# simulate_many_samples.sh creates the samples on the fly
258+
NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && bash util/simulate_many_samples.sh ${CI_PROJECT_NAME}/data/ test_data/case06_amr/reads.fq &> ${CI_PROJECT_NAME}/data/simulate_many_samples.log &"
259+
NF_PROCESS_FILES: >
260+
subworkflows/minimap_pipeline.nf
261+
lib/common.nf
262+
NF_WORKFLOW_OPTS: >
263+
--fastq="$CI_PROJECT_NAME/data/test_data/"
264+
-executor.\$$local.memory 16GB
265+
--database_set "ncbi_16s_18s"
266+
--classifier minimap2
267+
--minimap2_by_reference
268+
NF_IGNORE_PROCESSES: "configure_igv,extractMinimap2Reads"
269+
## Test invalid combinations
270+
- if: $MATRIX_NAME == "exception-kraken2-reference"
271+
variables:
272+
NF_BEFORE_SCRIPT: mkdir -p ${CI_PROJECT_NAME}/data/ && wget -q -O ${CI_PROJECT_NAME}/data/wf-metagenomics-demo.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/wf-metagenomics-demo.tar.gz && tar -xzvf ${CI_PROJECT_NAME}/data/wf-metagenomics-demo.tar.gz -C ${CI_PROJECT_NAME}/data/
273+
NF_WORKFLOW_OPTS: >
274+
--fastq test_data/case01
275+
--reference "${CI_PROJECT_NAME}/data/wf-metagenomics-demo/reference/genomes.fna.gz"
276+
--ref2taxid "${CI_PROJECT_NAME}/data/wf-metagenomics-demo/reference/ref2taxid.tsv"
277+
ASSERT_NEXTFLOW_FAILURE: "yes"
278+
ASSERT_NEXTFLOW_FAILURE_REXP : "To use kraken2 with your custom database, you need to use `--database` (instead of `--reference`) and include the `bracken_dist` within it."
279+
- if: $MATRIX_NAME == "exception-minimap2-database"
280+
variables:
281+
NF_WORKFLOW_OPTS: "--fastq test_data/case01 --database data/ --classifier minimap2"
282+
ASSERT_NEXTFLOW_FAILURE: "yes"
283+
ASSERT_NEXTFLOW_FAILURE_REXP : "To use minimap2 with your custom database, you need to use `--reference` (instead of `--database`) and `--ref2taxid`."
284+
- if: $MATRIX_NAME == "exception-minimap2-real-time"
285+
variables:
286+
NF_WORKFLOW_OPTS: "--fastq test_data/case01 --real_time --classifier minimap2"
287+
ASSERT_NEXTFLOW_FAILURE: "yes"
288+
ASSERT_NEXTFLOW_FAILURE_REXP : "Real time subworkflow must use kraken2 classifier."
289+
- if: $MATRIX_NAME == "exception-minimap2-igv-mmi-reference"
290+
## Pass OPTIONAL_FILE as the reference, as the goal is to make the test fail
291+
variables:
292+
NF_WORKFLOW_OPTS: >
293+
--fastq test_data/case01
294+
--classifier minimap2
295+
--reference "data/OPTIONAL_FILE"
296+
--ref2taxid "${CI_PROJECT_NAME}/data/wf-metagenomics-demo/reference/ref2taxid.tsv"
297+
--igv
298+
ASSERT_NEXTFLOW_FAILURE: "yes"
299+
ASSERT_NEXTFLOW_FAILURE_REXP : "The custom database reference must be a FASTA format file in order to view within IGV."
253300

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

77

8-
## [Unreleased]
8+
## [v2.11.1]
99
### Fixed
1010
- kraken2_client exits with `fastcat_histogram` usage error when using real time pipeline with `exclude_host` option.
1111
### Changed

README.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ input_reads.fastq ─── input_directory ─── input_directory
173173

174174
| Nextflow parameter name | Type | Description | Help | Default |
175175
|--------------------------|------|-------------|------|---------|
176-
| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
176+
| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB and are only available in the kraken2 approach. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
177177
| database | string | Not required but can be used to specifically override Kraken2 database [.tar.gz or Directory]. | By default uses database chosen in database_set parameter. | |
178178
| taxonomy | string | Not required but can be used to specifically override taxonomy database. Change the default to use a different taxonomy file [.tar.gz or directory]. | By default NCBI taxonomy file will be downloaded and used. | |
179179
| reference | string | Override the FASTA reference file selected by the database_set parameter. It can be a FASTA format reference sequence collection or a minimap2 MMI format index. | This option should be used in conjunction with the ref2taxid parameter to specify a custom minimap2 database; it cannot be combined with the database parameter. | |
@@ -225,7 +225,7 @@ input_reads.fastq ─── input_directory ─── input_directory
225225
| Nextflow parameter name | Type | Description | Help | Default |
226226
|--------------------------|------|-------------|------|---------|
227227
| out_dir | string | Directory for output of all user-facing files. | | output |
228-
| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. | | False |
228+
| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. If using a custom reference, this must be a FASTA file and not a minimap2 MMI format index. | | False |
229229
| include_read_assignments | boolean | A per sample TSV file that indicates the taxonomy assigned to each sequence. The TSVs will only be output on completion of the workflow and therefore not at all if using the real time option whilst running indefinitely. | | False |
230230

231231

@@ -265,7 +265,7 @@ Output files may be aggregated including information for all samples or provided
265265
| Index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
266266
| GZI index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
267267
| JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated |
268-
| Taxonomic assignment per read. | reads_assignments/{{ alias }}.{{kraken2|minimap2}}.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
268+
| Taxonomic assignment per read. | reads_assignments/{{ alias }}.*.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
269269
| FASTQ of the selected taxids. | extracted/{{ alias }}.minimap2.extracted.fastq | FASTQ containing/excluding the reads of the selected taxids. | per-sample |
270270

271271

@@ -414,6 +414,7 @@ The real-time subworkflow uses a server process to handle Kraken2 classification
414414
+ See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/).
415415
+ When using the Minimap2 pipeline with a custom database, you must make sure that the `ref2taxid` and reference files are coherent, as well as the taxonomy database.
416416
+ If your device doesn't have the resources to use large Kraken2 databases (e.g. Standard-8, PlusPF-8 and PlusPFP-8), you can enable `kraken2_memory_mapping` to reduce the amount of memory required.
417+
+ To enable the IGV viewer with a custom reference, the reference must be a FASTA file and not a minimap2 MMI format index.
417418

418419

419420

@@ -428,7 +429,7 @@ If your question is not answered here, please report any issues or suggestions o
428429
* 16S, 18S, ITS
429430
* ncbi_16s_18s and ncbi_16s_18s_28s_ITS: Archaeal, bacterial and fungal 16S/18S and ITS data. There are two databases available using the data from [NCBI](https://www.ncbi.nlm.nih.gov/refseq/targetedloci/)
430431
* SILVA_138_1: The [SILVA](https://www.arb-silva.de/) database (version 138) is also available. Note that SILVA uses its own set of taxids, which do not match the NCBI taxids. We provide the respective taxdump files, but if you prefer using the NCBI ones, you can create them from the SILVA files ([NCBI](https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/ncbi/)). As the SILVA database uses genus level, the last taxonomic rank at which the analysis is carried out is genus (`taxonomic_rank G`).
431-
* General databases
432+
* General databases (available only in kraken2 approaches)
432433
* Standard-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
433434
* PlusPF-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa and fungi. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
434435
* PlusPFP-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa, fungi and plant. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).

docs/06_input_parameters.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535

3636
| Nextflow parameter name | Type | Description | Help | Default |
3737
|--------------------------|------|-------------|------|---------|
38-
| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
38+
| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB and are only available in the kraken2 approach. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
3939
| database | string | Not required but can be used to specifically override Kraken2 database [.tar.gz or Directory]. | By default uses database chosen in database_set parameter. | |
4040
| taxonomy | string | Not required but can be used to specifically override taxonomy database. Change the default to use a different taxonomy file [.tar.gz or directory]. | By default NCBI taxonomy file will be downloaded and used. | |
4141
| reference | string | Override the FASTA reference file selected by the database_set parameter. It can be a FASTA format reference sequence collection or a minimap2 MMI format index. | This option should be used in conjunction with the ref2taxid parameter to specify a custom minimap2 database; it cannot be combined with the database parameter. | |
@@ -87,7 +87,7 @@
8787
| Nextflow parameter name | Type | Description | Help | Default |
8888
|--------------------------|------|-------------|------|---------|
8989
| out_dir | string | Directory for output of all user-facing files. | | output |
90-
| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. | | False |
90+
| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. If using a custom reference, this must be a FASTA file and not a minimap2 MMI format index. | | False |
9191
| include_read_assignments | boolean | A per sample TSV file that indicates the taxonomy assigned to each sequence. The TSVs will only be output on completion of the workflow and therefore not at all if using the real time option whilst running indefinitely. | | False |
9292

9393

docs/07_outputs.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,5 @@ Output files may be aggregated including information for all samples or provided
1818
| Index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
1919
| GZI index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
2020
| JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated |
21-
| Taxonomic assignment per read. | reads_assignments/{{ alias }}.{{kraken2|minimap2}}.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
21+
| Taxonomic assignment per read. | reads_assignments/{{ alias }}.*.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
2222
| FASTQ of the selected taxids. | extracted/{{ alias }}.minimap2.extracted.fastq | FASTQ containing/excluding the reads of the selected taxids. | per-sample |

docs/09_troubleshooting.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
+ See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/).
33
+ When using the Minimap2 pipeline with a custom database, you must make sure that the `ref2taxid` and reference files are coherent, as well as the taxonomy database.
44
+ If your device doesn't have the resources to use large Kraken2 databases (e.g. Standard-8, PlusPF-8 and PlusPFP-8), you can enable `kraken2_memory_mapping` to reduce the amount of memory required.
5+
+ To enable the IGV viewer with a custom reference, the reference must be a FASTA file and not a minimap2 MMI format index.

docs/10_FAQ.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ If your question is not answered here, please report any issues or suggestions o
66
* 16S, 18S, ITS
77
* ncbi_16s_18s and ncbi_16s_18s_28s_ITS: Archaeal, bacterial and fungal 16S/18S and ITS data. There are two databases available using the data from [NCBI](https://www.ncbi.nlm.nih.gov/refseq/targetedloci/)
88
* SILVA_138_1: The [SILVA](https://www.arb-silva.de/) database (version 138) is also available. Note that SILVA uses its own set of taxids, which do not match the NCBI taxids. We provide the respective taxdump files, but if you prefer using the NCBI ones, you can create them from the SILVA files ([NCBI](https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/ncbi/)). As the SILVA database uses genus level, the last taxonomic rank at which the analysis is carried out is genus (`taxonomic_rank G`).
9-
* General databases
9+
* General databases (available only in kraken2 approaches)
1010
* Standard-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
1111
* PlusPF-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa and fungi. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
1212
* PlusPFP-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa, fungi and plant. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).

main.nf

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,18 @@ workflow {
4545
if (params.classifier == 'minimap2' && params.database) {
4646
throw new Exception("To use minimap2 with your custom database, you need to use `--reference` (instead of `--database`) and `--ref2taxid`.")
4747
}
48+
49+
boolean output_igv = params.igv
50+
if (params.classifier == 'minimap2' && params.reference && params.igv) {
51+
ArrayList ref_exts = [".fa", ".fa.gz", ".fasta", ".fasta.gz", ".fna", ".fna.gz"]
52+
if (! ref_exts.any { ext -> file(params.reference).name.endsWith(ext) }) {
53+
output_igv = false
54+
log.info("The custom database reference must be a FASTA format file in order to view within IGV.")
55+
} else {
56+
output_igv=true
57+
}
58+
}
59+
4860
if ((params.classifier == 'kraken2' || params.real_time ) && params.reference) {
4961
throw new Exception("To use kraken2 with your custom database, you need to use `--database` (instead of `--reference`) and include the `bracken_dist` within it.")
5062
}
@@ -69,7 +81,7 @@ workflow {
6981
log.info("Note: Or consider to use the --kraken2_memory_mapping.")
7082
}
7183

72-
}
84+
}
7385
if(params.taxonomy){
7486
// this can be useful if the user wants to use a new taxonomy database (maybe updated) but the default reference.
7587
source_name = params.database_set
@@ -167,7 +179,8 @@ workflow {
167179
databases_minimap2.taxonomy,
168180
databases_minimap2.taxonomic_rank,
169181
common_minimap2_opts,
170-
keep_bam
182+
keep_bam,
183+
output_igv
171184
)
172185
}
173186

0 commit comments

Comments
 (0)