Merge pull request #8 from PROBIC/docs

tmaklin · web-flow · commit 30f573ef2e67 · 2020-03-30T10:21:50.000Z
Documentation &amp; input filter
diff --git a/README.md b/README.md
@@ -40,7 +40,12 @@ mGEMS bin and mGEMS extract, which bin the reads in the input
 pseudoalignment (mGEMS bin) and extract the binned reads from the
 original mixed samples (mGEMS extract).
 
-### (Pseudo)tutorial — Full pipeline with Themisto and mSWEEP
+### Tutorial — E. coli ST131 sublineages
+A tutorial for reproducing the *E. coli* ST131 sublineage phylogenetic
+tree presented in Mäklin et al. 2020 using mGEMS is available in the
+[docs folder of this repository](docs/TUTORIAL.md).
+
+### Quickstart — full pipeline
 Build a [Themisto](https://github.com/algbio/themisto) index to
 align against.
 ```
@@ -49,23 +54,23 @@ mkdir themisto_index/tmp
 build_index --k 31 --input-file example.fasta --auto-colors --index-dir themisto_index --temp-dir themisto_index/tmp
 ```
 
-Align paired-end reads 'reads_1.fastq.gz' and 'reads_2.fastq.gz' with Themisto
+Align paired-end reads 'reads_1.fastq.gz' and 'reads_2.fastq.gz' with Themisto (note the **--sort-output** flag must be used!)
 ```
-pseudoalign --index-dir themisto_index --query-file reads_1.fastq.gz --outfile pseudoalignments_1.aln --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192 --sort-output
-pseudoalign --index-dir themisto_index --query-file reads_2.fastq.gz --outfile pseudoalignments_2.aln --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192 --sort-output
+pseudoalign --index-dir themisto_index --query-file reads_1.fastq.gz --outfile pseudoalignments_1.aln --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192 --sort-output --gzip-output
+pseudoalign --index-dir themisto_index --query-file reads_2.fastq.gz --outfile pseudoalignments_2.aln --rc --temp-dir themisto_index/tmp --n-threads 16 --mem-megas 8192 --sort-output --gzip-output
 ```
 
 Estimate the relative abundances with mSWEEP (reference_grouping.txt
 should contain the groups the sequences in 'example.fasta' are
 assigned to. See the [mSWEEP](https://github.com/probic/msweep-assembly) usage instructions for details).
 ```
-mSWEEP --themisto-1 pseudoalignments_1.aln --themisto-2 pseudoalignments_2.aln -o mSWEEP -i reference_grouping.txt --write-probs
+mSWEEP --themisto-1 pseudoalignments_1.aln.gz --themisto-2 pseudoalignments_2.aln.gz -o mSWEEP -i reference_grouping.txt --write-probs
 ```
 
 Bin the reads and write all bins to the 'mGEMS-out' folder
 ```
 mkdir mGEMS-out
-mGEMS -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.txt,pseudoalignments_2.txt -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
+mGEMS -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
 ```
 This will write the binned paired-end reads for *all groups* in the
 mSWEEP_abundances.txt file in the mGEMS-out folder (compressed with
@@ -75,13 +80,13 @@ zlib).
 ... or bin and write only the reads that are assigned to "group-3" or
 "group-4" by adding the '--groups group-3,group-4' flag
 ```
-mGEMS --groups group-3,group-4 -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.txt,pseudoalignments_2.txt -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
+mGEMS --groups group-3,group-4 -r reads_1.fastq.gz,reads_2.fastq.gz --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
 ```
 
 Alternatively, find and write only the read bins for "group-3" and
 "group-4", skipping extracting the reads
 ```
-mGEMS bin --groups group-3,group-4 --themisto-alns pseudoalignments_1.txt,pseudoalignments_2.txt -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
+mGEMS bin --groups group-3,group-4 --themisto-alns pseudoalignments_1.aln.gz,pseudoalignments_2.aln.gz -o mGEMS-out --probs mSWEEP_probs.csv -a mSWEEP_abundances.txt --index themisto_index
 ```
 
 ... and extract the reads when feeling like it
@@ -102,9 +107,9 @@ mGEMS accepts the following input flags
 	-a                 Relative abundance estimates from mSWEEP (tab-separated, 1st
 	                   column has the group names and 2nd column the estimates).
 	--index            Themisto pseudoalignment index directory.
-	--groups           (Optional) which groups to extract from the input reads.
-	--compress         (Optional) Toggle compressing the output files
-                       (default: compress)
+	--groups           (Optional) Which groups to extract from the input reads.
+	--min-abundance    (Optional) Extract only groups that have a relative abundance higher than this value.
+	--compress         (Optional) Toggle compressing the output files (default: compress)
 ```
 
 
diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
@@ -0,0 +1,226 @@
+# Genomic epidemiology with mixed samples: a tutorial
+This tutorial constains instructions on how to repproduce the results
+of the three main synthetic experiments from *Genomic epidemiology
+with mixed samples*, Mäklin et al. 2020, in preparation.
+
+The tutorial will focus on reproducing the *Escherichia coli*
+experiment but contains instructions on how to adapt the scripts to
+the *Enterococcus faecalis* and *Staphylococcus aureus* experiments.
+
+For quick instructions on how to run the pipeline in a general
+setting, please refer to the README.md file in the root of this
+repository.
+
+## Requirements
+### mGEMS pipeline
+- [Themisto](https://github.com/algbio/Themisto)
+- [mSWEEP](https://github.com/probic/mSWEEP)
+- [mGEMS](https://github.com/probic/mGEMS)
+- [shovill](https://github.com/tseemann/shovill/)
+
+### Phylogenetic analysis
+- [snippy](https://github.com/tseemann/snippy/)
+- [RAxML-NG](https://github.com/amkozlov/raxml-ng)
+
+### Extra (macOS only)
+#### GNU coreutils
+On a macOS system, you'll also need to install GNU coreutils from
+homebrew and alias the macOS zcat command to the GNU zcat command for
+the duration of the session
+```
+brew install coreutils
+alias zcat=gzcat
+ulimit -n 2048
+```
+#### Concurrent file connections limit
+macOS also limits the number of concurrent file connections, which
+will have to be increased to run Themisto and shovill
+```
+ulimit -n 2048
+```
+
+## Tutorial
+### Table of Contents
+
+- [Select a species](#selectspecies)
+- [Reference data](#referencedata)
+- [Synthetic mixed samples](#mixedsamples)
+- [Indexing](#indexing)
+- [Pseudoalignment](#pseudoalignment)
+- [Abundance estimation](#estimation)
+- [Binning](#binning)
+- [Assembly](#assembly)
+- [SNP calling](#snpcalling)
+- [Phylogenetic inference](#phylogenetics)
+
+
+### <a name="selectspecies"></a>Select a species
+Download the supplementary table from the mGEMS manucsript which
+contains the relevant information
+```
+wget https://zenodo.org/record/3724144/files/mGEMS_Supplementary_Table_mixed_samples.tsv
+```
+Filter the table to contain only the *E. coli* (ecoli) experiments
+```
+grep "ecoli" mGEMS_Supplementary_Table_mixed_samples.tsv" > mixed_samples.tsv
+```
+If you want to reproduce the *E. faecalis* experiments, change 'ecoli'
+to 'efaec'. For *S. aureus*, change 'ecoli' to 'saur'. Running these
+other two experiments may require resources beyond the typical laptop or
+desktop computer.
+
+### <a name="referencedata"></a>Reference data
+The reference data from Mäklin et al. is available from zenodo
+- [*E. coli*](https://zenodo.org/record/3724112)
+- [*E. faecalis*](https://zenodo.org/record/3724101)
+- [*S. aureus*](https://zenodo.org/record/3724135)
+
+Construction of the reference dataset(s) is describe in more detail in
+Mäklin et al. 2020.
+
+Download and extract the *E. coli* dataset by running
+```
+wget https://zenodo.org/record/3724112/files/mGEMS-ecoli-reference-v1.0.0.tar.gz
+tar -zxvf mGEMS-ecoli-reference-v1.0.0.tar.gz
+```
+
+### <a name="indexing"></a>Indexing
+Create a *31*-mer pseudoalignment index with Themisto using two
+threads and maximum 8192 megabytes of RAM.
+```
+mkdir mGEMS-ecoli-reference
+mkdir mGEMS-ecoli-reference/tmp
+build_index --k 31 --input-file mGEMS-ecoli-reference-sequences-v1.0.0.fasta.gz --auto-colors --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --mem-megas 8192 --n-threads 2
+```
+change 'ecoli' to 'efaec' or 'saur' if you are trying to reproduce the
+other experiments.
+
+### <a name="mixedsamples"></a>Synthetic mixed samples
+Download the isolate sequencing data and create the synthetic mixed
+samples by concatenating the isolate files
+```
+## Download the sequencing data and create the samples
+oldid=""
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	sample=$(echo $line | cut -f1 -d' ')
+	scripts/get_forward.sh $sample | gunzip -c >> $id""_1.fastq
+	scripts/get_reverse.sh $sample | gunzip -c >> $id""_2.fastq
+	if [[ "$id" != "$oldid" ]]; then
+		if [ ! -z "$oldid" -a "$oldid" != "" ]; then
+			gzip $oldid""_1.fastq
+			gzip $oldid""_2.fastq
+		fi
+	fi
+	oldid=$id
+done < mixed_samples.tsv
+gzip $oldid""_1.fastq
+gzip $oldid""_2.fastq
+```
+
+### <a name="pseudoalignment"></a>Pseudoalignment
+Align the mixed sample files against the index using two threads
+```
+for f1 in *_1.fastq.gz; do
+	f=${f1%_1.fastq.gz}
+	f2=$f""_2.fastq.gz
+	pseudoalign --query-file $f1 --outfile $f""_1.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 2 --rc --sort-output --gzip-output
+	pseudoalign --query-file $f2 --outfile $f""_2.aln --index-dir mGEMS-ecoli-reference --temp-dir mGEMS-ecoli-reference/tmp --n-threads 2 --rc --sort-output --gzip-output
+done
+```
+
+### <a name="estimation"></a>Abundance estimation
+Estimate the relative abundances with mSWEEP and write the results and posterior
+probabilities using two threads
+```
+for f1 in *_1.fastq.gz; do
+	f=${f1%_1.fastq.gz}
+	mkdir $f
+	mSWEEP --themisto-1 $f""_1.aln.gz --themisto-2 $f""_2.aln.gz --themisto-index mGEMS-ecoli-reference -i mGEMS-ecoli-reference-grouping-v1.0.0.txt -o $f/$f --write-probs --gzip-probs -t 2
+done
+```
+
+### <a name="binning"></a>Binning
+Bin the reads with mGEMS and write the binned samples to the
+'ecoli-1' folder.
+```
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	mGEMS --groups $cluster -r $id""_1.fastq.gz,$id""_2.fastq.gz --themisto-alns $id""_1.aln.gz,$id""_2.aln.gz -o $id --probs $id/$id""_probs.csv.gz -a $id/$id""_abundances.txt --index mGEMS-ecoli-reference
+done < mixed_samples.tsv
+```
+Note that by default mGEMS creates bins for **all** reference lineages. If know
+which lineages the samples originate from (in our case these are
+supplied in the mixed_samples.tsv in the second column), the
+'--groups' option enables you to only create those bins. Multiple
+groups can be binned in the single run by supplying them as a
+comma-separated list.
+
+### <a name="assembly"></a>Assembly
+Assemble the sequences with shovill using 2 threads and maximum of
+8192 megabytes of RAM
+```
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	shovill --outdir $id/$cluster --R1 $id/$cluster""_1.fastq.gz --R2 $id/$cluster""_2.fastq.gz --cpus 2 --ram 8
+	mv $id/$cluster/contigs.fa $id/
+	rm -rf $id/$cluster
+	mkdir $id/$cluster
+	mv $id/contigs.fa $id/$cluster/
+done < mixed_samples.tsv
+```
+
+### <a name="snpcalling"></a>SNP calling
+Download the reference sequence 'NCTC13441' from the ENA
+```
+wget -O NCTC13441.fasta.gz http://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/uf/UFZF01.fasta.gz
+```
+Call SNPs in the genome with snippy
+```
+mkdir snippy-tmp
+gunzip NCTC13441.fasta.gz
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	sample=$(echo $line | cut -f1 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	snippy --outdir $id/$cluster/$sample --ctgs $id/$cluster/contigs.fa --ref NCTC13441.fasta --cpus 2 --ram 8 --tmpdir snippy-tmp
+done < mixed_samples.tsv
+gzip NCTC13441.fasta
+```
+Build the core SNP alignment with snippy
+```
+snippys=""
+while read line; do
+	id=$(echo $line | cut -f3 -d' ')
+	sample=$(echo $line | cut -f1 -d' ')
+	cluster=$(echo $line | cut -f2 -d' ')
+	snippys=$snippys""$id/$cluster/$sample" "
+	last=$id/$cluster/$sample
+done < mixed_samples.tsv
+snippy-core --ref $last/ref.fa $snippys
+```
+the alignment will be stored in the 'core.full.aln' file.
+
+### <a name="phylogenetics"></a>Phylogenetic inference
+Use RAxML-NG to infer a maximum likelihood phylogeny from 10 starting
+parsimony and random trees under the GTR+G4 model
+```
+raxml-ng --search --msa core.full.aln --prefix CT --threads 2 --tree rand{10},pars{10} --model GTR+G4
+```
+Calculate bootstrap support values with 100 replicates
+```
+raxml-ng --bootstrap --msa core.full.aln --bs-trees 100 --prefix CB --threads 2 --model GTR+G4
+```
+Perform a bootstrap convergence check
+```
+raxml-ng --bsconverge --bs-trees CB.raxml.bootstraps --prefix CS --seed 2 --threads 2
+```
+Add the bootstrap support values to the maximum likelihood tree with
+the best likelihood
+```
+raxml-ng --support --tree CT.raxml.bestTree --bs-trees CB.raxml.bootstraps --prefix CS --threads 2
+```
+The best tree with the bootstrap values will be written in the
+'CS.raxml.support' file.
diff --git a/docs/scripts/get_forward.sh b/docs/scripts/get_forward.sh
@@ -0,0 +1,24 @@
+ftppath="ftp://ftp.sra.ebi.ac.uk/vol1/fastq"
+
+strlen=$((${#1}))
+dir1="/"${1:0:6}
+
+if [ ${strlen} -lt 10 ]
+then
+dir2="/"
+elif [ ${strlen} -lt 11 ]
+then
+dir2="/00"${1: -1}"/"
+elif [ ${strlen} -lt 12 ]
+then
+dir2="/0"${1: -2}"/"
+elif [ ${strlen} -lt 13 ]
+then
+dir2="/"${1: -3}"/"
+else
+echo "check accession number"
+fi
+
+dlpath=$ftppath$dir1$dir2$1"/"
+
+wget -qO- $dlpath""$1_1.fastq.gz
diff --git a/docs/scripts/get_reverse.sh b/docs/scripts/get_reverse.sh
@@ -0,0 +1,24 @@
+ftppath="ftp://ftp.sra.ebi.ac.uk/vol1/fastq"
+
+strlen=$((${#1}))
+dir1="/"${1:0:6}
+
+if [ ${strlen} -lt 10 ]
+then
+dir2="/"
+elif [ ${strlen} -lt 11 ]
+then
+dir2="/00"${1: -1}"/"
+elif [ ${strlen} -lt 12 ]
+then
+dir2="/0"${1: -2}"/"
+elif [ ${strlen} -lt 13 ]
+then
+dir2="/"${1: -3}"/"
+else
+echo "check accession number"
+fi
+
+dlpath=$ftppath$dir1$dir2$1"/"
+
+wget -qO- $dlpath""$1_2.fastq.gz
diff --git a/src/main.cpp b/src/main.cpp
@@ -39,9 +39,11 @@ void ParseBin(int argc, char* argv[], cxxargs::Arguments &args) {
   args.add_long_argument<std::string>("probs", "Posterior probabilities from mSWEEP.");
   args.add_long_argument<std::string>("merge-mode", "How to merge paired-end alignments from Themisto (default: intersection).", "intersection");
   args.add_long_argument<std::vector<std::string>>("groups", "Which reference groups to bin reads to (default: all).");
+  args.add_long_argument<long double>("min-abundance", "Bin only the groups that have a relative abundance higher than this value (optional).");
   args.add_short_argument<long double>('q', "Tuning parameter for the binning thresholds (default: 1.0).", (long double)1);
   args.add_long_argument<std::string>("index", "Themisto pseudoalignment index directory.");
   args.set_not_required("groups");
+  args.set_not_required("min-abundance");
 
   args.parse(argc, argv);
 }
@@ -91,6 +93,15 @@ void ReadAndExtract(cxxargs::Arguments &args) {
   Extract(bins, target_groups, args);
 }
 
+void FilterTargetGroups(const std::vector<std::string> &group_names, const std::vector<long double> &abundances, const long double min_abundance, std::vector<std::string> *target_groups) {
+  uint32_t n_groups = group_names.size();
+  for (uint32_t i = 0; i < n_groups; ++i) {
+    if (abundances[i] < min_abundance && std::find(target_groups->begin(), target_groups->end(), group_names[i]) != target_groups->end()) {
+      target_groups->erase(std::find(target_groups->begin(), target_groups->end(), group_names[i]));
+    }
+  }
+}
+
 void Bin(const cxxargs::Arguments &args, bool extract_bins) {
   DIR* dir = opendir(args.value<std::string>('o').c_str());
   if (dir) {
@@ -128,6 +139,10 @@ void Bin(const cxxargs::Arguments &args, bool extract_bins) {
   } else {
     target_groups = groups;
   }
+  if (args.is_initialized("min-abundance")) {
+    FilterTargetGroups(groups, abundances, args.value<long double>("min-abundance"), &target_groups);
+  }
+
   const std::vector<std::vector<uint32_t>> &bins = mGEMS::Bin(aln, args.value<long double>('q'), abundances, groups, probs_file.stream(), &target_groups);
   if (!extract_bins) {
     uint32_t n_bins = bins.size();