Commit 96bc13a

Merge pull request #28 from emo-bon/develop

This PR leads to the 1st release of metaGOflow.

2 parents: d8499a7 + 843f079

File tree: 118 files changed (+166002, −3843 lines)


.github/workflows/conda.yml

Lines changed: 12 additions & 2 deletions

```diff
@@ -3,14 +3,24 @@ on: [push]
 jobs:
   cwl_tests:
     name: Run cwl_tests.sh
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04
     defaults:
       run:
         shell: bash -l {0}
     steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get install -y python3-pip
+          pip install cwltool lockfile
+
       - uses: actions/checkout@v2
       - run: |
-          ls
+          ls
+
+      - name: Validate workflow
+        run: |
+          cwltool --validate workflows/gos_wf.cwl
+
 #    - uses: conda-incubator/setup-miniconda@v2
 #      with:
 #        activate-environment: anaconda-client-env
```

.gitignore

Lines changed: 7 additions & 0 deletions

```diff
@@ -116,8 +116,15 @@ venv.bak/
 
 # Ignore real-world test samples
 test_input/SRR*
+test_input/DB*
 
 # Ignore dev output
 TEST_*/
 *.output
 
+STELIOS_TEST/
+marine_sediment_dbh/
+
+slurm_run.sh
+
+
```
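The new ignore entries follow standard glob semantics. As a rough sketch of what the hunk above ignores, Python's `fnmatch` can approximate the matching (illustrative only: git's pattern rules for directories and anchoring differ from plain `fnmatch`, and the directory patterns are shown without their trailing slash):

```python
from fnmatch import fnmatch

# Patterns taken from the .gitignore hunk above (directory patterns
# written without the trailing slash, since fnmatch has no notion
# of directories).
PATTERNS = ["test_input/SRR*", "test_input/DB*", "TEST_*", "*.output", "slurm_run.sh"]

def is_ignored(path: str) -> bool:
    """Return True if `path` matches any of the ignore patterns."""
    return any(fnmatch(path, pattern) for pattern in PATTERNS)
```

For example, `is_ignored("test_input/SRR1620013_1.fastq.gz")` is true, while `is_ignored("config.yml")` is false.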

Installation/download_dbs.sh

Lines changed: 3 additions & 3 deletions

```diff
@@ -125,6 +125,7 @@ wget $FTP_DBS/kofam_ko_desc.tsv
 echo 'Download eggnog dbs'
 wget http://eggnog5.embl.de/download/emapperdb-5.0.2/eggnog_proteins.dmnd.gz
 wget http://eggnog5.embl.de/download/emapperdb-5.0.2/eggnog.db.gz
+gunzip eggnog.db.gz eggnog_proteins.dmnd.gz
 mkdir eggnog && mv eggnog_proteins.dmnd eggnog.db eggnog
 
 # Diamond
@@ -139,6 +140,5 @@ echo 'Download pathways data'
 wget ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/graphs-20200805.pkl.gz \
     ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/all_pathways_class.txt.gz \
     ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipeline-5.0/ref-dbs/all_pathways_names.txt.gz
-gunzip graphs.pkl.gz all_pathways_class.txt.gz all_pathways_names.txt.gz
-mkdir kegg_pathways && mv graphs.pkl all_pathways_class.txt all_pathways_names.txt kegg_pathways
-
+gunzip graphs-20200805.pkl.gz all_pathways_class.txt.gz all_pathways_names.txt.gz
+mkdir kegg_pathways && mv graphs-20200805.pkl all_pathways_class.txt all_pathways_names.txt kegg_pathways
```
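After `download_dbs.sh` finishes, the reference-database directory should contain the entries listed in this commit's README. A minimal sketch of a sanity check (hypothetical helper, not part of the repo; the entry names are taken from the README in this commit and may differ in later releases):

```python
from pathlib import Path

# Top-level entries the README shows under ref-dbs/ (assumption:
# names as listed in this commit; versions may change later).
EXPECTED = [
    "db_kofam", "diamond", "eggnog", "GO-slim",
    "interproscan-5.57-90.0", "kegg_pathways",
    "kofam_ko_desc.tsv", "Rfam", "silva_lsu", "silva_ssu",
]

def missing_dbs(ref_dbs: str) -> list:
    """Return the expected entries that are not present under ref_dbs.

    Works for both subdirectories and plain files (kofam_ko_desc.tsv),
    and also follows symbolic links, so a symlinked database counts
    as present.
    """
    root = Path(ref_dbs)
    return [name for name in EXPECTED if not (root / name).exists()]
```

An empty result means the layout matches; anything returned still needs downloading or symlinking.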

README.md

Lines changed: 119 additions & 32 deletions

````diff
@@ -1,79 +1,166 @@
-# metaGOflow: A workflow for marine Genomic Observatories data analysis
+# metaGOflow: A workflow for marine Genomic Observatories' data analysis
 
-## An EOSC-Life project
+![logo](https://raw.githubusercontent.com/hariszaf/metaGOflow-use-case/gh-pages/assets/img/metaGOflow_logo_italics.png)
 
-[![Build Status](https://travis-ci.org/EBI-Metagenomics/pipeline-v5.svg?branch=master)](https://travis-ci.com/EBI-Metagenomics/pipeline-v5)
 
-The workflows developed in the framework of this project are based on `pipeline-v5` of the MGnify resource.
+## An EOSC-Life project
 
-> This branch is a child of the [`pipeline_5.1`](https://github.com/hariszaf/pipeline-v5/tree/pipeline_5.1) branch
-that contains all CWL descriptions of the MGnify pipeline version 5.1.
+The workflows developed in the framework of this project are based on `pipeline-v5` of the MGnify resource.
 
+> This branch is a child of the [`pipeline_5.1`](https://github.com/hariszaf/pipeline-v5/tree/pipeline_5.1) branch
+> that contains all CWL descriptions of the MGnify pipeline version 5.1.
 
 ## Dependencies
 
-- python3 [v 3.7+]
-- [Docker](https://www.docker.com) [v 19.+] or [Singularity](https://apptainer.org)
-- [cwltool](https://github.com/common-workflow-language/cwltool) [v 3.+]
+To run metaGOflow, first make sure the following are set up on your computing environment:
+
+- python3 [v 3.8+]
+- [Docker](https://www.docker.com) [v 19.+] or [Singularity](https://apptainer.org) [v 3.7.+]/[Apptainer](https://apptainer.org) [v 1.+]
+- [cwltool](https://github.com/common-workflow-language/cwltool) [v 3.+]
+- [rdflib](https://rdflib.readthedocs.io/en/stable/) [v 6.+]
+- [rdflib-jsonld](https://pypi.org/project/rdflib-jsonld/) [v 0.6.2]
+- [ro-crate-py](https://github.com/ResearchObject/ro-crate-py) [v 0.7.0]
+- [pyyaml](https://pypi.org/project/PyYAML/) [v 6.0]
+- [Node.js](https://nodejs.org/) [v 10.24.0+]
+- Available storage ~235GB for databases
 
-Depending on the analysis you are about to run, disk requirements vary.
+### Storage while running
+
+Depending on the analysis you are about to run, disk requirements vary.
 Indicatively, you may have a look at the metaGOflow publication for computing resources used in various cases.
 
+## Installation
 
 ### Get the EOSC-Life marine GOs workflow
 
 ```bash
-git clone https://github.com/emo-bon/pipeline-v5.git
-cd pipeline-v5
+git clone https://github.com/emo-bon/MetaGOflow
+cd MetaGOflow
 ```
 
-
-### Download necessary databases
+### Download necessary databases (~235GB)
 
 You can download databases for the EOSC-Life GOs workflow by running the
 `download_dbs.sh` script under the `Installation` folder.
 
-If you have one or more already in your system, then create a symbolic link pointing
-at the `ref-dbs` folder.
+```bash
+bash Installation/download_dbs.sh -f [Output Directory e.g. ref-dbs]
+```
+If you have one or more already in your system, then create a symbolic link pointing
+at the `ref-dbs` folder or at one of its subfolders/files.
+
+The final structure of the DB directory should be like the following:
 
+````bash
+user@server:~/MetaGOflow: ls ref-dbs/
+db_kofam/  diamond/  eggnog/  GO-slim/  interproscan-5.57-90.0/  kegg_pathways/  kofam_ko_desc.tsv  Rfam/  silva_lsu/  silva_ssu/
+````
 
 ## How to run
 
+### Ensure that `Node.js` is installed on your system before running metaGOflow
+
+If you have root access on your system, you can run the commands below to install it:
+
+##### DEBIAN/UBUNTU
+```bash
+sudo apt-get update -y
+sudo apt-get install -y nodejs
+```
+
+##### RH/CentOS
+```bash
+sudo yum install rh-nodejs<stream version> (e.g. rh-nodejs10)
+```
 
-- Edit the `config.yml` file to set the parameter values of your choice.
+### Set up the environment
 
-- Make a job file (e.g., SBATCH file) and
+#### Run once - Setup environment
 
-   - enable Singularity, e.g. `module load Singularity`
+- ```bash
+  conda create -n EOSC-CWL python=3.8
+  ```
+
+- ```bash
+  conda activate EOSC-CWL
+  ```
+
+- ```bash
+  pip install cwlref-runner cwltool[all] rdflib-jsonld rocrate pyyaml
+  ```
+
+#### Run every time
+
+```bash
+conda activate EOSC-CWL
+```
+
+### Run the workflow
+
+- Edit the `config.yml` file to set the parameter values of your choice. To run all the steps, set the variables in lines 2-6 to `true`.
+
+#### Using Singularity
+
+##### Standalone
+- run:
+```bash
+./run_wf.sh -s -n osd-short -d short-test-case -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz
+```
+
+##### Using a cluster with a queueing system (e.g. SLURM)
+
+- Create a job file (e.g., SBATCH file)
+
+- Enable Singularity, e.g. `module load Singularity`, and all other dependencies
+
+- Add the run line to the job file
+
+
+#### Using Docker
+
+##### Standalone
+- run:
+```bash
+./run_wf.sh -n osd-short -d short-test-case -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz
+```
+HINT: If you are using Docker, you may need to run the above command without the `-s` flag.
+
+## Testing samples
+The samples are available in the `test_input` folder.
+
+We provide metaGOflow with partial samples from the Human Metagenome Project ([SRR1620013](https://www.ebi.ac.uk/ena/browser/view/SRR1620013) and [SRR1620014](https://www.ebi.ac.uk/ena/browser/view/SRR1620014)).
+They are partial in that only a small subset of their sequences has been kept, so that the pipeline can be tested quickly.
 
-- run:
-```
-./run_wf.sh -n false -n osd-short -d short-test-case -f test_input/wgs-paired-SRR1620013_1.fastq.gz -r test_input/wgs-paired-SRR1620013_2.fastq.gz
-```
 
 ## Hints and tips
 
 1. In case you are using Docker, it is strongly recommended to **avoid** installing it through `snap`.
 
-2. `RuntimeError`: slurm currently does not support shared caching, because it does not support cleaning up a worker after the last job finishes.
-Set the `--disableCaching` flag if you want to use this batch system.
+2. `RuntimeError`: slurm currently does not support shared caching, because it does not support cleaning up a worker
+after the last job finishes.
+Set the `--disableCaching` flag if you want to use this batch system.
+
+3. In case you are having errors like:
 
-3. In case you are having errors like:
 ```
-wltool.errors.WorkflowException: Singularity is not available for this tool
+cwltool.errors.WorkflowException: Singularity is not available for this tool
 ```
+
 You may run the following command:
+
 ```
 singularity pull --force --name debian:stable-slim.sif docker://debian:stable-sli
 ```
 
-
 ## Contribution
 
-To make contribution to the project a bit easier, all the MGnify `conditionals` and `subworkflows` under the `workflows/` directory that are not used in the metaGOflow framework, have been removed.
-However, all the MGnify `tools/` and `utils/` are available in this repo, even if they are not invoked in the current version of metaGOflow.
-This way, we hope we encourage people to implement their own `conditionals` and/or `subworkflows` by exploiting the currently supported `tools` and `utils` as well as by developing new `tools` and/or `utils`.
-
+To make contributing to the project a bit easier, all the MGnify `conditionals` and `subworkflows` under
+the `workflows/` directory that are not used in the metaGOflow framework have been removed.
+However, all the MGnify `tools/` and `utils/` are available in this repo, even if they are not invoked in the current
+version of metaGOflow.
+This way, we hope to encourage people to implement their own `conditionals` and/or `subworkflows` by exploiting the
+currently supported `tools` and `utils` as well as by developing new `tools` and/or `utils`.
 
 
 <!-- cwltool --print-dot my-wf.cwl | dot -Tsvg > my-wf.svg -->
````
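The README's run examples pass the same flags in both the Singularity and Docker cases: `-n` run name, `-d` output directory, `-f`/`-r` forward/reverse reads, and `-s` to enable Singularity. A small sketch that assembles such an invocation (hypothetical helper; the flag meanings are inferred from the examples above, not from `run_wf.sh` itself):

```python
import shlex

def build_run_cmd(name, out_dir, forward, reverse, use_singularity=False):
    """Assemble a ./run_wf.sh command line from the flags shown in
    the README examples. Flag semantics are an assumption based on
    those examples: -s Singularity, -n name, -d dir, -f/-r reads."""
    cmd = ["./run_wf.sh"]
    if use_singularity:
        cmd.append("-s")
    cmd += ["-n", name, "-d", out_dir, "-f", forward, "-r", reverse]
    return shlex.join(cmd)  # shell-safe quoting (Python 3.8+)
```

For instance, `build_run_cmd("osd-short", "short-test-case", "test_input/wgs-paired-SRR1620013_1.fastq.gz", "test_input/wgs-paired-SRR1620013_2.fastq.gz", use_singularity=True)` reproduces the Singularity standalone example.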
Binary file not shown.

config.yml

Lines changed: 15 additions & 12 deletions

```diff
@@ -1,12 +1,15 @@
 # Steps to go for
 qc_and_merge_step: true
-taxonomic_inventory: false
-cgc_step: false
-reads_functional_annotation: false
+taxonomic_inventory: true
+cgc_step: true
+reads_functional_annotation: true
 assemble: false
 
 # Global
-threads: 20
+threads: 40
+
+# As a rule of thumb, keep this at floor(threads/8), where threads is the previous parameter
+interproscan_threads: 4
 
 # fastp parameters
 detect_adapter_for_pe: false
@@ -28,8 +31,8 @@ min-contig-len: 200
 # Combined Gene Caller // the size is in MB
 cgc_chunk_size: 200
 
-# Taxonomic inference using Diamond and the contigs
-diamond_maxTargetSeqs: 1
+# # Taxonomic inference using Diamond and the contigs
+# diamond_maxTargetSeqs: 1
 
 # Functional annotation
 protein_chunk_size_IPS: 2000000
@@ -57,21 +60,21 @@ protein_chunk_size_hmm: 50000
 processed_reads: {
   class: File,
   format: "edam:format_1929",
-  path: /home1/gmoro/pipeline-v5/test_input/pseudo.merged.fasta
+  path: workflows/pseudo_files/pseudo.merged.fasta
 }
 
 # Mandatory for running the taxonomy inventory step
 input_for_motus: {
   class: File,
-  path: /home1/gmoro/pipeline-v5/test_input/pseudo.merged.unfiltered.fasta
+  path: workflows/pseudo_files/pseudo.merged.unfiltered.fasta
 }
 
 
 # Mandatory for running the functional annotation steps
 # If produced previously from metaGOflow, will have a suffix like: .cmsearch.all.tblout.deoverlapped
 maskfile: {
   class: File,
-  path: /home1/gmoro/pipeline-v5/test_input/pseudo.merged.cmsearch.all.tblout.deoverlapped
+  path: workflows/pseudo_files/pseudo.merged.cmsearch.all.tblout.deoverlapped
 }
 
 # Mandatory for the functional annotation step
@@ -84,13 +87,13 @@ count_faa_from_previous_run:
 predicted_faa_from_previous_run: {
   class: File,
   format: "edam:format_1929",
-  path: /home1/gmoro/pipeline-v5/test_input/pseudo.merged_CDS.faa
+  path: workflows/pseudo_files/pseudo.merged_CDS.faa
 }
 
 # Mandatory for running the assembly step
 processed_read_files:
   - class: File
-    path: /home1/gmoro/pipeline-v5/test_input/pseudo_1_clean.fastq.trimmed.fasta
+    path: workflows/pseudo_files/pseudo_1_clean.fastq.trimmed.fasta
   - class: File
-    path: /home1/gmoro/pipeline-v5/test_input/pseudo_2_clean.fastq.trimmed.fasta
+    path: workflows/pseudo_files/pseudo_2_clean.fastq.trimmed.fasta
```
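The new `interproscan_threads` parameter comes with a rule of thumb in the config comment: floor(threads/8). A one-liner sketch of that rule (hypothetical helper; the lower bound of 1 is my assumption, not stated in the config, and note the config above chooses 4 with `threads: 40`, slightly below the rule's floor(40/8) = 5):

```python
def interproscan_threads(threads: int) -> int:
    """Rule of thumb from the config.yml comment: floor(threads / 8).
    Clamped to at least 1 thread (assumption, not in the config)."""
    return max(1, threads // 8)
```

For example, the previous default of `threads: 20` would give `interproscan_threads(20) == 2`.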

dependencies.md

Lines changed: 0 additions & 56 deletions
This file was deleted.
