nf-core · tracelail · Mar 26, 2025 · Mar 26, 2025 · Mar 26, 2025 · May 21, 2025
diff --git a/.nf-test.log b/.nf-test.log
@@ -0,0 +1,9 @@
+Jul-09 09:36:15.278 [main] INFO  com.askimed.nf.test.App - nf-test 0.9.2
+Jul-09 09:36:15.294 [main] INFO  com.askimed.nf.test.App - Arguments: [test, subworkflows/local/diamond/tests/main.nf.tests]
+Jul-09 09:36:16.153 [main] INFO  com.askimed.nf.test.App - Nextflow Version: 24.10.6
+Jul-09 09:36:16.155 [main] INFO  com.askimed.nf.test.commands.RunTestsCommand - Load config from file /home/trace/projects/proteinannotator/nf-test.config...
+Jul-09 09:36:16.663 [main] WARN  com.askimed.nf.test.nextflow.NextflowScript - Module /home/trace/projects/proteinannotator/subworkflows/local/functional_annotation/main.nf: Dependency '/home/trace/projects/proteinannotator/subworkflows/local/functional_annotation/../../../modules/nf-core/blast/makeblastdb/main.nf' not found.
+Jul-09 09:36:16.728 [main] INFO  com.askimed.nf.test.lang.dependencies.DependencyResolver - Loaded 21 files from directory /home/trace/projects/proteinannotator in 0.081 sec
+Jul-09 09:36:16.730 [main] INFO  com.askimed.nf.test.lang.dependencies.DependencyResolver - Found 0 files containing tests.
+Jul-09 09:36:16.730 [main] DEBUG com.askimed.nf.test.lang.dependencies.DependencyResolver - Found files: []
+Jul-09 09:36:16.732 [main] INFO  com.askimed.nf.test.commands.RunTestsCommand - Found 0 tests to execute.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,3 +1,6 @@
 {
-    "markdown.styles": ["public/vscode_markdown.css"]
+    "markdown.styles": [
+        "public/vscode_markdown.css"
+    ],
+    "nextflow.telemetry.enabled": true
 }
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -10,6 +10,14 @@
 
 ## Pipeline tools
 
+- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+
+> Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online].
+
+- [DIAMOND](https://github.com/bbuchfink/diamond)
+
+> Buchfink B, Xie C, Huson DH, "Fast and sensitive protein alignment using DIAMOND", Nature Methods 12, 59-60 (2015). doi:10.1038/nmeth.3176
+
 - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
 
 > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924.

diff --git a/README.md b/README.md
@@ -33,11 +33,15 @@
      workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples.   -->
 <!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
 
+
+
 1. Run ([`seqkit stats`](https://bioinf.shenwei.me/seqkit/usage/#stats)) to summarize input protein fasta files
 2. Functional Annotation:
    1. ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics.
+   2. ([`DIAMOND`](https://github.com/bbuchfink/diamond)) a sequence aligner for fast and sensitive protein alignment against a reference database.
 3. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
 
+
 ![Protein annotator metromap. Protein fasta files are summarized with `seqkit stats`, then functionally annotated with InterProScan, DIAMOND-blastp, UniFire, and Kmerseek](assets/proteinannotator-metromap.excalidraw.png)
 
 ## Usage

diff --git a/docs/output.md b/docs/output.md
@@ -14,6 +14,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 
 - [Functional Annotation](#functional-annotation) Annotate proteins with functional domains
   - [InterProScan](#Interproscan) - Search the InterPro database for functional domains
+  - [Diamond](#diamond) - Provide ‘hits’ of potential homologous protein matches between species
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
 - [SeqKit stats](#seqkit_stats) - Simple statistics for protein FASTA files
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
@@ -75,7 +76,7 @@ AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS
 
 </details>
 
-#### JavaScript Object Notation (JSON) Output
+##### JavaScript Object Notation (JSON) Output
 
 JSON representation of the matches - an alternative to XML format. As new releases are made public, the changes to the expected JSON format are documented in [Change log for InterProScan JSON output format](https://interproscan-docs.readthedocs.io/en/v5/JSONOutputFormatHistory.html#change-log-for-interproscan-json-output-format).
 
@@ -268,6 +269,115 @@ The XML Schema Definition (XSD) is available [here](http://ftp.ebi.ac.uk/pub/sof
 
 </details>
 
+#### Diamond
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `functional_annotation/diamond`
+  - `*.blast`: (Basic Local Alignment Search Tool) BLAST pairwise format
+  - `*.xml`: BLAST Extensible Markup Language (XML) format
+  - `*.txt`: BLAST tabular format (default). This format can be customized: the format number 6 may be followed by a space-separated list of the blast_columns keywords, each specifying a field of the output.
+  - `*.daa`: DIAMOND alignment archive (DAA). The DAA format is a proprietary binary format that can subsequently be used to generate other output formats using the view command. It is also supported by MEGAN and allows a quick import of results. 
+  - `*.sam`: SAM format. 
+  - `*.tsv`: Taxonomic classification. This format will not print alignments but only a taxonomic classification for each query using the LCA algorithm. 
+  - `*.paf`: PAF format. The custom fields in the format are AS (bit score), ZR (raw score) and ZE (e-value)
+
+</details>
+
+[Diamond](https://github.com/bbuchfink/diamond) provides sensitive protein sequence alignment. The process provides ‘hits’ that are potential homologous protein matches between species, indicating an evolutionary relationship inferred from protein sequence similarity.
+
+##### Pairwise Alignment Format (.blast) Output
+
+The pairwise BLAST format is a human readable format that is useful for visual inspection, if one desires to get full alignment details for individual alignments.
+
+<details markdown="1">
+<summary>Example Pairwise Alignment Format output</summary>
+
+```
+
+```
+
+</details>
+
+##### BLAST Extensible Markup Language (XML) Output
+
+An XML (Extensible Markup Language) file has the same information as the pairwise file but is better suited to bioinformatics software and scripts (machine readable), due to its structure and ease of parsing.
+
+<details markdown="1">
+<summary>Example Extensible Markup Language (XML) output</summary>
+
+```
+
+```
+
+</details>
+
+##### Text File (TXT) Output (default)
+
+The BLAST tabular format is the default output, and the output columns can be modified depending on analysis needs. This format is much smaller than the other BLAST formats, is compatible with most downstream processing, and is easily filtered and analyzed.
+
+<details markdown="1">
+<summary>Example Text File (TXT) output</summary>
+
+```
+
+```
+
+</details>
+
+##### DIAMOND Alignment Archive (DAA) Output
+
+DIAMOND alignment archive (DAA) is a compressed proprietary binary format that can be converted to any of the other output formats (.blast, .xml, .txt, .sam, .tsv, .paf) with the DIAMOND view command without rerunning the pipeline. It can also be used in some metagenomic analysis software.
+
+<details markdown="1">
+<summary>Example DIAMOND Alignment Archive (DAA) output</summary>
+
+```
+
+```
+
+</details>
+
+##### Sequence Alignment/Map (SAM) Output
+
+The SAM (Sequence Alignment/Map) file adapts the DIAMOND protein alignment output in a similar fashion to the genomic alignment. This allows for easy integration into SAM/BAM pipelines and protein alignment visualization with IGV browser.
+
+<details markdown="1">
+<summary>Example Sequence Alignment/Map (SAM) output</summary>
+
+```
+
+```
+
+</details>
+
+##### Tab-Separated Values (TSV) Output
+
+The taxonomic classification (.tsv) output provides taxonomic composition and is useful for biological interpretation rather than alignment comparison.
+
+<details markdown="1">
+<summary>Example Tab-Separated Values (TSV) output</summary>
+
+```
+
+```
+
+</details>
+
+##### Pairwise Mapping Format (PAF)
+
+PAF (Pairwise mApping Format) is a file format that was originally used for long read sequencing. DIAMOND adds three additional variables, AS (bit score), ZR (raw alignment score), and ZE (E-value), to provide statistical evidence for protein alignment. This format is useful if one is looking for positional information and statistical significance.
+
+<details markdown="1">
+<summary>Example Pairwise Mapping Format (PAF) output</summary>
+
+```
+
+```
+
+</details>
+
 ### MultiQC
 
 <details markdown="1">

diff --git a/modules.json b/modules.json
@@ -5,24 +5,30 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
-                    "mmseqs/search": {
+                    "diamond/blastp": {
                         "branch": "master",
-                        "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
-                        "installed_by": ["modules"]
+                        "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d",
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
-                    "mtmalign/align": {
+                    "diamond/makedb": {
                         "branch": "master",
-                        "git_sha": "c7cfb9446fb3098e525089198ff232d795c20ef2",
-                        "installed_by": ["modules"]
+                        "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d",
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "seqkit/stats": {
                         "branch": "master",
-                        "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
+                        "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
                         "installed_by": ["modules"]
                     },
                     "untar": {
@@ -37,20 +43,26 @@
                     "utils_nextflow_pipeline": {
                         "branch": "master",
                         "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     },
                     "utils_nfcore_pipeline": {
                         "branch": "master",
                         "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     },
                     "utils_nfschema_plugin": {
                         "branch": "master",
                         "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     }
                 }
             }
         }
     }
-}
+}
diff --git a/modules/local/diamondpreparetaxa/environment.yml b/modules/local/diamondpreparetaxa/environment.yml
@@ -0,0 +1,10 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  # wget fetches the tarball; tar + gzip unpack it. NOTE(review): confirm these pins resolve on conda-forge.
+  - conda-forge::gzip=1.13
+  - conda-forge::tar=1.34
+  - conda-forge::wget=1.21.4
diff --git a/modules/local/diamondpreparetaxa/main.nf b/modules/local/diamondpreparetaxa/main.nf
@@ -0,0 +1,57 @@
+// Fetch a gzipped taxonomy dump tarball and unpack it into taxa/.
+//   input : taxondmp_zip - URL of the tarball,
+//           e.g. ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+//   output: taxonnodes (taxa/nodes.dmp), taxonnames (taxa/names.dmp),
+//           versions (versions.yml)
+process DIAMONDPREPARETAXA {
+
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    // TODO: 'YOUR-TOOL-HERE' is the unedited template placeholder; point both
+    //       entries at a real image that provides wget and tar.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE':
+        'biocontainers/YOUR-TOOL-HERE' }"
+
+    input:
+    // A URL passed as a value, not a staged file: wget fetches it inside the task.
+    val taxondmp_zip
+
+    output:
+    path("taxa/nodes.dmp"), emit: taxonnodes
+    path("taxa/names.dmp"), emit: taxonnames
+    path "versions.yml"   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Save the download under a fixed name so the tar step does not depend on
+    // the basename of the URL, and quote the URL so the shell leaves it intact.
+    """
+    mkdir -p taxa/
+    wget -q -O taxdump.tar.gz "${taxondmp_zip}"
+    tar -xzf taxdump.tar.gz -C taxa
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        wget: \$(wget --version | head -n 1 | sed 's/^[^0-9]*//; s/ .*\$//')
+        tar: \$(tar --version | head -n 1 | sed 's/^[^0-9]*//; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    // The directory must exist before touch can create the placeholder files.
+    """
+    mkdir -p taxa/
+    touch taxa/nodes.dmp
+    touch taxa/names.dmp
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        wget: \$(wget --version | head -n 1 | sed 's/^[^0-9]*//; s/ .*\$//')
+        tar: \$(tar --version | head -n 1 | sed 's/^[^0-9]*//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/diamondpreparetaxa/meta.yml b/modules/local/diamondpreparetaxa/meta.yml
@@ -0,0 +1,51 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "diamondpreparetaxa"
+description: Download a gzipped taxonomy dump tarball and extract nodes.dmp and names.dmp
+keywords:
+  - diamond
+  - taxonomy
+  - taxdump
+  - download
+tools:
+  - "wget":
+      description: "Command-line utility used to download the taxonomy dump tarball"
+      homepage: "https://www.gnu.org/software/wget/"
+      documentation: "https://www.gnu.org/software/wget/manual/"
+      licence: ["GPL-3.0-or-later"]
+      identifier: ""
+  - "tar":
+      description: "Archiving utility used to unpack the taxonomy dump tarball"
+      homepage: "https://www.gnu.org/software/tar/"
+      documentation: "https://www.gnu.org/software/tar/manual/"
+      licence: ["GPL-3.0-or-later"]
+      identifier: ""
+
+input:
+  - - taxondmp_zip:
+        type: string
+        description: |
+          URL of the gzipped taxonomy dump tarball to download,
+          e.g. `ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz`
+
+output:
+  - taxonnodes:
+      - "taxa/nodes.dmp":
+          type: file
+          description: nodes.dmp file extracted from the taxonomy dump
+          pattern: "taxa/nodes.dmp"
+  - taxonnames:
+      - "taxa/names.dmp":
+          type: file
+          description: names.dmp file extracted from the taxonomy dump
+          pattern: "taxa/names.dmp"
+  - versions:
+      - "versions.yml":
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+
+authors:
+  - "@tracelail"
+maintainers:
+  - "@tracelail"