epi2me-labs
diff --git a/‎.gitlab-ci.yml‎
Lines changed: 52 additions & 5 deletions b/‎.gitlab-ci.yml‎
Lines changed: 52 additions & 5 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 5 additions & 4 deletions b/‎README.md‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎docs/06_input_parameters.md‎
Lines changed: 2 additions & 2 deletions b/‎docs/06_input_parameters.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/07_outputs.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/07_outputs.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/09_troubleshooting.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/09_troubleshooting.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/10_FAQ.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/10_FAQ.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎main.nf‎
Lines changed: 15 additions & 2 deletions b/‎main.nf‎
Lines changed: 15 additions & 2 deletions
@@ -13,7 +13,7 @@ variables:
     NF_WORKFLOW_OPTS: "--fastq ${CI_PROJECT_NAME}/data/wf-metagenomics-demo/test_data/ -executor.\\$$local.memory 16GB --database_set Standard-8"
     NF_PROCESS_FILES: >
       subworkflows/kraken_pipeline.nf
-    NF_IGNORE_PROCESSES: "rebatchFastq, output_kraken2_read_assignments"
+    NF_IGNORE_PROCESSES: "output_kraken2_read_assignments"
     CI_FLAVOUR: "new"
     CWG_AWS_ENV_NAME: "stack"
     PYTEST_CONTAINER_NAME: "wf-common"
@@ -23,7 +23,6 @@ variables:
 aws-run:
     variables:
         NF_WORKFLOW_OPTS: "--fastq test_data/case01 --store_dir s3://$${XAWS_BUCKET}/${CI_PROJECT_NAME}/store --database_set Standard-8"
-        NF_IGNORE_PROCESSES: "rebatchFastq"
     artifacts:
         when: always
         paths:
@@ -36,13 +35,11 @@ aws-run:
 singularity-run:
     variables:
         NF_WORKFLOW_OPTS: "--fastq test_data/case01 --database_set ncbi_16s_18s"
-        NF_IGNORE_PROCESSES: "rebatchFastq"
 
 ## This test avoids using the Standard-8 database. The use of this database is tested in aws-run
 macos-run:
     variables:
         NF_WORKFLOW_OPTS: "--fastq test_data/case01 --database_set ncbi_16s_18s"
-        NF_IGNORE_PROCESSES: "rebatchFastq"
 
 docker-run:
 
@@ -64,7 +61,11 @@ docker-run:
                 "minimap2-exclude-host", "kraken2-exclude-host", "minimap2-exclude-host-bam", "minimap2-exclude-host-empty-barcode",
                 "kraken2-bam", "minimap2-bam", "minimap2-igv",
                 "kraken2-real-time", "kraken2-real-time-bam", "amr-real-time",
-                "minimap2-split-prefix"
+                "minimap2-split-prefix",
+                "report-many-samples",
+                # test combinations of databases
+                "exception-kraken2-reference", "exception-minimap2-database",
+                "exception-minimap2-real-time", "exception-minimap2-igv-mmi-reference"
             ]
     rules:
      - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template")
@@ -250,4 +251,50 @@ docker-run:
             --database_set "ncbi_16s_18s"
             --read_limit 10000
           NF_IGNORE_PROCESSES: "concatAssignments,rebatchFastq,download_reference_ref2taxid,prepareSILVA,check_reference_ref2taxid,configure_igv,determine_bracken_length,download_unpack_taxonomy,unpack_download_kraken2_database"
+          ## Test the report with many samples, simulated on the fly
+     - if: $MATRIX_NAME == "report-many-samples"
+       variables:
+          # simulate_many_samples.sh creates the samples on the fly
+          NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && bash util/simulate_many_samples.sh ${CI_PROJECT_NAME}/data/ test_data/case06_amr/reads.fq &> ${CI_PROJECT_NAME}/data/simulate_many_samples.log &"
+          NF_PROCESS_FILES: >
+            subworkflows/minimap_pipeline.nf
+            lib/common.nf
+          NF_WORKFLOW_OPTS: >
+            --fastq="$CI_PROJECT_NAME/data/test_data/"
+            -executor.\$$local.memory 16GB
+            --database_set "ncbi_16s_18s"
+            --classifier minimap2
+            --minimap2_by_reference
+          NF_IGNORE_PROCESSES: "configure_igv,extractMinimap2Reads"
+      ## Test invalid combinations
+     - if: $MATRIX_NAME == "exception-kraken2-reference"
+       variables:
+          NF_BEFORE_SCRIPT: mkdir -p ${CI_PROJECT_NAME}/data/ && wget -q -O ${CI_PROJECT_NAME}/data/wf-metagenomics-demo.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-metagenomics/wf-metagenomics-demo.tar.gz && tar -xzvf ${CI_PROJECT_NAME}/data/wf-metagenomics-demo.tar.gz -C ${CI_PROJECT_NAME}/data/
+          NF_WORKFLOW_OPTS: >
+            --fastq test_data/case01
+            --reference "${CI_PROJECT_NAME}/data/wf-metagenomics-demo/reference/genomes.fna.gz"
+            --ref2taxid "${CI_PROJECT_NAME}/data/wf-metagenomics-demo/reference/ref2taxid.tsv"
+          ASSERT_NEXTFLOW_FAILURE: "yes"
+          ASSERT_NEXTFLOW_FAILURE_REXP : "To use kraken2 with your custom database, you need to use `--database` (instead of `--reference`) and include the `bracken_dist` within it."
+     - if: $MATRIX_NAME == "exception-minimap2-database"
+       variables:
+          NF_WORKFLOW_OPTS: "--fastq test_data/case01 --database data/ --classifier minimap2"
+          ASSERT_NEXTFLOW_FAILURE: "yes"
+          ASSERT_NEXTFLOW_FAILURE_REXP : "To use minimap2 with your custom database, you need to use `--reference` (instead of `--database`) and `--ref2taxid`."
+     - if: $MATRIX_NAME == "exception-minimap2-real-time"
+       variables:
+          NF_WORKFLOW_OPTS: "--fastq test_data/case01 --real_time --classifier minimap2"
+          ASSERT_NEXTFLOW_FAILURE: "yes"
+          ASSERT_NEXTFLOW_FAILURE_REXP : "Real time subworkflow must use kraken2 classifier."
+     - if: $MATRIX_NAME == "exception-minimap2-igv-mmi-reference"
+      ## Pass OPTIONAL_FILE as the goal is to make the test fail
+       variables:
+          NF_WORKFLOW_OPTS: >
+            --fastq test_data/case01
+            --classifier minimap2
+            --reference "data/OPTIONAL_FILE"
+            --ref2taxid "${CI_PROJECT_NAME}/data/wf-metagenomics-demo/reference/ref2taxid.tsv"
+            --igv
+          ASSERT_NEXTFLOW_FAILURE: "yes"
+          ASSERT_NEXTFLOW_FAILURE_REXP : "The custom database reference must be a FASTA format file in order to view within IGV."
 
@@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
-## [Unreleased]
+## [v2.11.1]
 ### Fixed
 - kraken2_client exits with `fastcat_histogram` usage error when using real time pipeline with `exclude_host` option.
 ### Changed
 
@@ -173,7 +173,7 @@ input_reads.fastq   ─── input_directory  ─── input_directory
 
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
-| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
+| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB and are only available in the kraken2 approach. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
 | database | string | Not required but can be used to specifically override Kraken2 database [.tar.gz or Directory]. | By default uses database chosen in database_set parameter. |  |
 | taxonomy | string | Not required but can be used to specifically override taxonomy database. Change the default to use a different taxonomy file  [.tar.gz or directory]. | By default NCBI taxonomy file will be downloaded and used. |  |
 | reference | string | Override the FASTA reference file selected by the database_set parameter. It can be a FASTA format reference sequence collection or a minimap2 MMI format index. | This option should be used in conjunction with the ref2taxid parameter to specify a custom database. |  |
@@ -225,7 +225,7 @@ input_reads.fastq   ─── input_directory  ─── input_directory
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
 | out_dir | string | Directory for output of all user-facing files. |  | output |
-| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. |  | False |
+| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. If using a custom reference, this must be a FASTA file and not a minimap2 MMI format index. |  | False |
 | include_read_assignments | boolean | A per sample TSV file that indicates the taxonomy assigned to each sequence. The TSV's will only be output on completion of the workflow and therefore not at all if using the real time option whilst running indefinitely. |  | False |
 
 
@@ -265,7 +265,7 @@ Output files may be aggregated including information for all samples or provided
 | Index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
 | GZI index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
 | JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated |
-| Taxonomic assignment per read. | reads_assignments/{{ alias }}.{{kraken2|minimap2}}.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
+| Taxonomic assignment per read. | reads_assignments/{{ alias }}.*.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
 | FASTQ of the selected taxids. | extracted/{{ alias }}.minimap2.extracted.fastq | FASTQ containing/excluding the reads of the selected taxids. | per-sample |
 
 
@@ -414,6 +414,7 @@ The real-time subworkflow uses a server process to handle Kraken2 classification
 + See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/).
 + When using the Minimap2 pipeline with a custom database, you must make sure that the `ref2taxid` and reference files are coherent, as well as the taxonomy database.
 + If your device doesn't have the resources to use large Kraken2 databases (e.g. Standard-8, PlusPF-8 and PlusPFP-8), you can enable `kraken2_memory_mapping` to reduce the amount of memory required.
++ To enable the IGV viewer with a custom reference, the reference must be a FASTA file and not a minimap2 MMI format index.
 
 
 
@@ -428,7 +429,7 @@ If your question is not answered here, please report any issues or suggestions o
     * 16S, 18S, ITS
         * ncbi_16s_18s and ncbi_16s_18s_28s_ITS:  Archaeal, bacterial and fungal 16S/18S and ITS data. There are two databases available using the data from [NCBI](https://www.ncbi.nlm.nih.gov/refseq/targetedloci/)
         * SILVA_138_1: The [SILVA](https://www.arb-silva.de/) database (version 138) is also available. Note that SILVA uses its own set of taxids, which do not match the NCBI taxids. We provide the respective taxdump files, but if you prefer using the NCBI ones, you can create them from the SILVA files ([NCBI](https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/ncbi/)). As the SILVA database uses genus level, the last taxonomic rank at which the analysis is carried out is genus (`taxonomic_rank G`).
-    * General databases
+    * General databases (available only in kraken2 approaches)
         * Standard-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
         * PlusPF-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa and fungi. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
         * PlusPFP-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa, fungi and plant. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
 
@@ -35,7 +35,7 @@
 
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
-| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
+| database_set | string | Sets the reference, databases and taxonomy datasets that will be used for classifying reads. Choices: ['ncbi_16s_18s','ncbi_16s_18s_28s_ITS', 'SILVA_138_1', 'Standard-8', 'PlusPF-8', 'PlusPFP-8']. Memory requirement will be slightly higher than the size of the database. Standard-8, PlusPF-8 and PlusPFP-8 databases require more than 8GB and are only available in the kraken2 approach. | This setting is overridable by providing an explicit taxonomy, database or reference path in the other reference options. | Standard-8 |
 | database | string | Not required but can be used to specifically override Kraken2 database [.tar.gz or Directory]. | By default uses database chosen in database_set parameter. |  |
 | taxonomy | string | Not required but can be used to specifically override taxonomy database. Change the default to use a different taxonomy file  [.tar.gz or directory]. | By default NCBI taxonomy file will be downloaded and used. |  |
 | reference | string | Override the FASTA reference file selected by the database_set parameter. It can be a FASTA format reference sequence collection or a minimap2 MMI format index. | This option should be used in conjunction with the ref2taxid parameter to specify a custom database. |  |
@@ -87,7 +87,7 @@
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
 | out_dir | string | Directory for output of all user-facing files. |  | output |
-| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. |  | False |
+| igv | boolean | Enable IGV visualisation in the EPI2ME Desktop Application by creating the required files. This will cause the workflow to emit the BAM files as well. If using a custom reference, this must be a FASTA file and not a minimap2 MMI format index. |  | False |
 | include_read_assignments | boolean | A per sample TSV file that indicates the taxonomy assigned to each sequence. The TSV's will only be output on completion of the workflow and therefore not at all if using the real time option whilst running indefinitely. |  | False |
 
 
 
@@ -18,5 +18,5 @@ Output files may be aggregated including information for all samples or provided
 | Index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
 | GZI index of the reduced reference FASTA file | igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
 | JSON configuration file for IGV browser | igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated |
-| Taxonomic assignment per read. | reads_assignments/{{ alias }}.{{kraken2|minimap2}}.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
+| Taxonomic assignment per read. | reads_assignments/{{ alias }}.*.assignments.tsv | TSV file with the taxonomic assignment per read. | per-sample |
 | FASTQ of the selected taxids. | extracted/{{ alias }}.minimap2.extracted.fastq | FASTQ containing/excluding the reads of the selected taxids. | per-sample |
@@ -2,3 +2,4 @@
 + See how to interpret some common nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/).
 + When using the Minimap2 pipeline with a custom database, you must make sure that the `ref2taxid` and reference files are coherent, as well as the taxonomy database.
 + If your device doesn't have the resources to use large Kraken2 databases (e.g. Standard-8, PlusPF-8 and PlusPFP-8), you can enable `kraken2_memory_mapping` to reduce the amount of memory required.
++ To enable the IGV viewer with a custom reference, the reference must be a FASTA file and not a minimap2 MMI format index.
@@ -6,7 +6,7 @@ If your question is not answered here, please report any issues or suggestions o
     * 16S, 18S, ITS
         * ncbi_16s_18s and ncbi_16s_18s_28s_ITS:  Archaeal, bacterial and fungal 16S/18S and ITS data. There are two databases available using the data from [NCBI](https://www.ncbi.nlm.nih.gov/refseq/targetedloci/)
         * SILVA_138_1: The [SILVA](https://www.arb-silva.de/) database (version 138) is also available. Note that SILVA uses its own set of taxids, which do not match the NCBI taxids. We provide the respective taxdump files, but if you prefer using the NCBI ones, you can create them from the SILVA files ([NCBI](https://www.arb-silva.de/no_cache/download/archive/current/Exports/taxonomy/ncbi/)). As the SILVA database uses genus level, the last taxonomic rank at which the analysis is carried out is genus (`taxonomic_rank G`).
-    * General databases
+    * General databases (available only in kraken2 approaches)
         * Standard-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
         * PlusPF-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa and fungi. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
         * PlusPFP-8: It contains references for Archaea, Bacteria, viral, plasmid, human, UniVec_Core, protozoa, fungi and plant. To use this database the memory available to the workflow must be slightly higher than size of the database index (8GB).
 
@@ -45,6 +45,18 @@ workflow {
     if (params.classifier == 'minimap2' && params.database) {
         throw new Exception("To use minimap2 with your custom database, you need to use `--reference` (instead of `--database`) and `--ref2taxid`.")
     }
+
+    boolean output_igv = params.igv
+    if (params.classifier == 'minimap2' && params.reference && params.igv) {
+        ArrayList ref_exts = [".fa", ".fa.gz", ".fasta", ".fasta.gz", ".fna", ".fna.gz"]
+        if (! ref_exts.any { ext -> file(params.reference).name.endsWith(ext) }) {
+            output_igv = false
+            log.info("The custom database reference must be a FASTA format file in order to view within IGV.")
+        } else {
+            output_igv=true
+        }
+    }
+
     if ((params.classifier == 'kraken2' || params.real_time ) && params.reference) {
         throw new Exception("To use kraken2 with your custom database, you need to use `--database` (instead of `--reference`) and include the `bracken_dist` within it.")
     }
@@ -69,7 +81,7 @@ workflow {
             log.info("Note: Or consider to use the --kraken2_memory_mapping.")
         }
 
-    } 
+    }
     if(params.taxonomy){
         // this can be useful if the user wants to use a new taxonomy database (maybe updated) but the default reference.
         source_name = params.database_set
@@ -167,7 +179,8 @@ workflow {
             databases_minimap2.taxonomy,
             databases_minimap2.taxonomic_rank,
             common_minimap2_opts,
-            keep_bam
+            keep_bam,
+            output_igv
             )
     }