snakemake-workflows · rabioinf · Nov 29, 2024 · Nov 29, 2024 · Nov 29, 2024 · Nov 30, 2024
diff --git a/.github/workflows/conventional-prs.yml b/.github/workflows/conventional-prs.yml
@@ -1,4 +1,4 @@
-name: PR
+name: Lint PR
 on:
   pull_request_target:
     types:
@@ -7,12 +7,14 @@ on:
       - edited
       - synchronize
 
+permissions:
+  pull-requests: read
+
 jobs:
-  title-format:
+  main:
+    name: Validate PR title
     runs-on: ubuntu-latest
     steps:
-      - uses: amannn/action-semantic-pull-request@v3.4.0
+      - uses: amannn/action-semantic-pull-request@v5
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          validateSingleCommit: true
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -2,18 +2,20 @@ name: Tests
 
 on:
   push:
-    branches: [ main ]
+    branches: [main, dev]
   pull_request:
-    branches: [ main ]
-
+    branches: [main, dev]
 
 jobs:
   Formatting:
     runs-on: ubuntu-latest
+    if: ${{ github.actor != 'github-actions[bot]' }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - name: Formatting
-        uses: github/super-linter@v4
+        uses: super-linter/super-linter@v7
         env:
           VALIDATE_ALL_CODEBASE: false
           DEFAULT_BRANCH: main
@@ -22,33 +24,35 @@ jobs:
 
   Linting:
     runs-on: ubuntu-latest
+    if: ${{ github.actor != 'github-actions[bot]' }}
     steps:
-    - uses: actions/checkout@v2
-    - name: Lint workflow
-      uses: snakemake/snakemake-github-action@v1.24.0
-      with:
-        directory: .
-        snakefile: workflow/Snakefile
-        args: "--lint"
+      - uses: actions/checkout@v4
+      - name: Lint workflow
+        uses: snakemake/snakemake-github-action@v1.25.1
+        with:
+          directory: .
+          snakefile: workflow/Snakefile
+          args: "--lint"
 
   Testing:
     runs-on: ubuntu-latest
-    needs: 
+    if: ${{ github.actor != 'github-actions[bot]' }}
+    needs:
       - Linting
       - Formatting
     steps:
-    - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
-    - name: Test workflow
-      uses: snakemake/snakemake-github-action@v1.24.0
-      with:
-        directory: .test
-        snakefile: workflow/Snakefile
-        args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
+      - name: Test workflow
+        uses: snakemake/snakemake-github-action@v1.25.1
+        with:
+          directory: .test
+          snakefile: workflow/Snakefile
+          args: "--use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache"
 
-    - name: Test report
-      uses: snakemake/snakemake-github-action@v1.24.0
-      with:
-        directory: .test
-        snakefile: workflow/Snakefile
-        args: "--report report.zip"
+      - name: Test report
+        uses: snakemake/snakemake-github-action@v1.25.1
+        with:
+          directory: .test
+          snakefile: workflow/Snakefile
+          args: "--report report.zip"
diff --git a/.github/workflows/release-please.yml b/.github/workflows/release-please.yml
@@ -9,9 +9,8 @@ jobs:
   release-please:
     runs-on: ubuntu-latest
     steps:
-
       - uses: GoogleCloudPlatform/release-please-action@v2
         id: release
         with:
           release-type: go # just keep a changelog, no version anywhere outside of git tags
-          package-name: <repo>
+          package-name: <repo>
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@ resources/**
 logs/**
 .snakemake
 .snakemake/**
+.test/results/*
diff --git a/.test/config/config.yml b/.test/config/config.yml
@@ -0,0 +1,27 @@
+samplesheet: "config/samples.tsv"
+
+get_genome:
+  database: "ncbi"
+  assembly: "GCF_000006785.2"
+  fasta: Null
+  gff: Null
+  gff_source_type:
+    [
+      "RefSeq": "gene",
+      "RefSeq": "pseudogene",
+      "RefSeq": "CDS",
+      "Protein Homology": "CDS",
+    ]
+
+simulate_reads:
+  read_length: 100
+  read_number: 100000
+  random_freq: 0.01
+
+cutadapt:
+  threep_adapter: "-a ATCGTAGATCGG"
+  fivep_adapter: "-A GATGGCGATAGG"
+  default: ["-q 10 ", "-m 25 ", "-M 100", "--overlap=5"]
+
+multiqc:
+  config: "config/multiqc_config.yml"
diff --git a/.test/config/multiqc_config.yml b/.test/config/multiqc_config.yml
@@ -0,0 +1,2 @@
+remove_sections:
+  - samtools-stats
diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv
@@ -0,0 +1,3 @@
+sample	condition	replicate	read1	read2
+sample1	wild_type	1	sample1.bwa.read1.fastq.gz	sample1.bwa.read2.fastq.gz
+sample2	wild_type	2	sample2.bwa.read1.fastq.gz	sample2.bwa.read2.fastq.gz
diff --git a/README.md b/README.md
@@ -1,21 +1,122 @@
 # Snakemake workflow: `<name>`
 
-[![Snakemake](https://img.shields.io/badge/snakemake-≥6.3.0-brightgreen.svg)](https://snakemake.github.io)
-[![GitHub actions status](https://github.com/<owner>/<repo>/workflows/Tests/badge.svg?branch=main)](https://github.com/<owner>/<repo>/actions?query=branch%3Amain+workflow%3ATests)
-
+[![Snakemake](https://img.shields.io/badge/snakemake-≥8.0.0-brightgreen.svg)](https://snakemake.github.io)
+[![GitHub actions status](https://github.com/MPUSP/snakemake-workflow-template/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/MPUSP/snakemake-workflow-template/actions/workflows/main.yml)
+[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
+[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1D355C.svg?labelColor=000000)](https://sylabs.io/docs/)
+[![workflow catalog](https://img.shields.io/badge/Snakemake%20workflow%20catalog-darkgreen)](https://snakemake.github.io/snakemake-workflow-catalog)
 
 A Snakemake workflow for `<description>`
 
+- [Snakemake workflow: `<name>`](#snakemake-workflow-name)
+  - [Usage](#usage)
+  - [Workflow overview](#workflow-overview)
+  - [Running the workflow](#running-the-workflow)
+    - [Input data](#input-data)
+    - [Execution](#execution)
+    - [Parameters](#parameters)
+  - [Authors](#authors)
+  - [References](#references)
+  - [TODO](#todo)
 
 ## Usage
 
 The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=<owner>%2F<repo>).
 
-If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) <repo>sitory and its DOI (see above).
+If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this repository or its DOI.
+
+## Workflow overview
+
+This workflow is a best-practice workflow for `<detailed description>`.
+The workflow is built using [snakemake](https://snakemake.readthedocs.io/en/stable/) and consists of the following steps:
+
+1. Parse sample sheet containing sample meta data (`python`)
+2. Simulate short read sequencing data on the fly (`dwgsim`)
+3. Check quality of input read data (`FastQC`)
+4. Trim adapters from input data (`cutadapt`)
+5. Collect statistics from tool output (`MultiQC`)
+
+## Running the workflow
+
+### Input data
+
+This template workflow creates artificial sequencing data in `*.fastq.gz` format. It does not contain actual input data. The simulated input files are nevertheless created based on a mandatory table linked in the `config.yml` file (default: `.test/config/samples.tsv`). The sample sheet has the following layout:
+
+| sample  | condition | replicate | read1                      | read2                      |
+| ------- | --------- | --------- | -------------------------- | -------------------------- |
+| sample1 | wild_type | 1         | sample1.bwa.read1.fastq.gz | sample1.bwa.read2.fastq.gz |
+| sample2 | wild_type | 2         | sample2.bwa.read1.fastq.gz | sample2.bwa.read2.fastq.gz |
+
+
+### Execution
+
+To run the workflow from command line, change the working directory.
+
+```bash
+cd path/to/snakemake-workflow-name
+```
+
+Adjust options in the default config file `config/config.yml`.
+Before running the entire workflow, you can perform a dry run using:
+
+```bash
+snakemake --dry-run
+```
+
+To run the complete workflow with test files using **conda**, execute the following command. The definition of the number of compute cores is mandatory.
+
+```bash
+snakemake --cores 3 --sdm conda --directory .test
+```
+
+To run the workflow with **singularity** / **apptainer**, add a link to a container registry in the `Snakefile`, for example:
+`container: "oras://ghcr.io/<user>/<repository>:<version>"` for Github's container registry. Run the workflow with:
+
+```bash
+snakemake --cores 3 --sdm conda apptainer --directory .test
+```
+
+### Parameters
+
+This table lists all parameters that can be used to run the workflow.
+
+| parameter          | type | details                                 | default                                       |
+| ------------------ | ---- | --------------------------------------- | --------------------------------------------- |
+| **samplesheet**    |      |                                         |                                               |
+| path               | str  | path to samplesheet, mandatory          | "config/samples.tsv"                          |
+| **get_genome**     |      |                                         |                                               |
+| database           | str  | one of `manual`, `ncbi`                 | `ncbi`                                        |
+| assembly           | str  | RefSeq ID                               | `GCF_000006785.2`                             |
+| fasta              | str  | optional path to fasta file             | Null                                          |
+| gff                | str  | optional path to gff file               | Null                                          |
+| gff_source_type    | list | list of name/value pairs for GFF source | see config file                               |
+| **simulate_reads** |      |                                         |                                               |
+| read_length        | num  | length of target reads in bp            | 100                                           |
+| read_number        | num  | number of total reads to be simulated   | 100000                                        |
+| random_freq        | num  | frequency of random read sequences      | 0.01                                          |
+| **cutadapt**       |      |                                         |                                               |
+| threep_adapter     | str  | sequence of the 3' adapter              | `-a ATCGTAGATCGG`                             |
+| fivep_adapter      | str  | sequence of the 5' adapter              | `-A GATGGCGATAGG`                             |
+| default            | list | additional options passed to `cutadapt` | [`-q 10 `, `-m 25 `, `-M 100`, `--overlap=5`] |
+| **multiqc**        |      |                                         |                                               |
+| config             | str  | path to multiQC config                  | `config/multiqc_config.yml`                   |
+
+## Authors
+
+- Firstname Lastname
+  - Affiliation
+  - ORCID profile
+  - home page
+
+## References
+
+> Köster, J., Mölder, F., Jablonski, K. P., Letcher, B., Hall, M. B., Tomkins-Tinch, C. H., Sochat, V., Forster, J., Lee, S., Twardziok, S. O., Kanitz, A., Wilm, A., Holtgrewe, M., Rahmann, S., & Nahnsen, S. *Sustainable data analysis with Snakemake*. F1000Research, 10:33, **2021**. https://doi.org/10.12688/f1000research.29032.2.
 
-# TODO
+## TODO
 
 * Replace `<owner>` and `<repo>` everywhere in the template (also under .github/workflows) with the correct `<repo>` name and owning user or organization.
 * Replace `<name>` with the workflow name (can be the same as `<repo>`).
 * Replace `<description>` with a description of what the workflow does.
+* Update the workflow description, parameters, running options, authors and references in the `README.md`
+* Update the `README.md` badges. Add or remove badges for `conda`/`singularity`/`apptainer` usage depending on the workflow's capability
 * The workflow will occur in the snakemake-workflow-catalog once it has been made public. Then the link under "Usage" will point to the usage instructions if `<owner>` and `<repo>` were correctly set.
diff --git a/config/README.md b/config/README.md
@@ -1,2 +1,82 @@
-Describe how to configure the workflow (using config.yaml and maybe additional files).
-All of them need to be present with example entries inside of the config folder.
+## Workflow overview
+
+This workflow is a best-practice workflow for `<detailed description>`.
+The workflow is built using [snakemake](https://snakemake.readthedocs.io/en/stable/) and consists of the following steps:
+
+1. Parse sample sheet containing sample meta data (`python`)
+2. Simulate short read sequencing data on the fly (`dwgsim`)
+3. Check quality of input read data (`FastQC`)
+4. Trim adapters from input data (`cutadapt`)
+5. Collect statistics from tool output (`MultiQC`)
+
+## Running the workflow
+
+### Input data
+
+This template workflow creates artificial sequencing data in `*.fastq.gz` format. It does not contain actual input data. The simulated input files are nevertheless created based on a mandatory table linked in the `config.yml` file (default: `.test/config/samples.tsv`). The sample sheet has the following layout:
+
+| sample  | condition | replicate | read1                      | read2                      |
+| ------- | --------- | --------- | -------------------------- | -------------------------- |
+| sample1 | wild_type | 1         | sample1.bwa.read1.fastq.gz | sample1.bwa.read2.fastq.gz |
+| sample2 | wild_type | 2         | sample2.bwa.read1.fastq.gz | sample2.bwa.read2.fastq.gz |
+
+
+### Execution
+
+To run the workflow from command line, change the working directory.
+
+```bash
+cd path/to/snakemake-workflow-name
+```
+
+Adjust options in the default config file `config/config.yml`.
+Before running the entire workflow, you can perform a dry run using:
+
+```bash
+snakemake --dry-run
+```
+
+To run the complete workflow with test files using **conda**, execute the following command. The definition of the number of compute cores is mandatory.
+
+```bash
+snakemake --cores 3 --sdm conda --directory .test
+```
+
+To run the workflow with **singularity** / **apptainer**, add a link to a container registry in the `Snakefile`, for example:
+`container: "oras://ghcr.io/<user>/<repository>:<version>"` for Github's container registry. Run the workflow with:
+
+```bash
+snakemake --cores 3 --sdm conda apptainer --directory .test
+```
+
+### Parameters
+
+This table lists all parameters that can be used to run the workflow.
+
+| parameter          | type | details                                 | default                                       |
+| ------------------ | ---- | --------------------------------------- | --------------------------------------------- |
+| **samplesheet**    |      |                                         |                                               |
+| path               | str  | path to samplesheet, mandatory          | "config/samples.tsv"                          |
+| **get_genome**     |      |                                         |                                               |
+| database           | str  | one of `manual`, `ncbi`                 | `ncbi`                                        |
+| assembly           | str  | RefSeq ID                               | `GCF_000006785.2`                             |
+| fasta              | str  | optional path to fasta file             | Null                                          |
+| gff                | str  | optional path to gff file               | Null                                          |
+| gff_source_type    | list | list of name/value pairs for GFF source | see config file                               |
+| **simulate_reads** |      |                                         |                                               |
+| read_length        | num  | length of target reads in bp            | 100                                           |
+| read_number        | num  | number of total reads to be simulated   | 100000                                        |
+| random_freq        | num  | frequency of random read sequences      | 0.01                                          |
+| **cutadapt**       |      |                                         |                                               |
+| threep_adapter     | str  | sequence of the 3' adapter              | `-a ATCGTAGATCGG`                             |
+| fivep_adapter      | str  | sequence of the 5' adapter              | `-A GATGGCGATAGG`                             |
+| default            | list | additional options passed to `cutadapt` | [`-q 10 `, `-m 25 `, `-M 100`, `--overlap=5`] |
+| **multiqc**        |      |                                         |                                               |
+| config             | str  | path to multiQC config                  | `config/multiqc_config.yml`                   |
+
+## TODO
+
+* Replace `<owner>` and `<repo>` everywhere in the template (also under .github/workflows) with the correct `<repo>` name and owning user or organization.
+* Replace `<name>` with the workflow name (can be the same as `<repo>`).
+* Replace `<description>` with a description of what the workflow does.
+* Update the workflow parameters and running options