Small bug fixes and text corrections

jfnavarro · jfnavarro · commit b3000fcec202 · 2021-01-13T11:46:25.000+01:00
diff --git a/README.md b/README.md
@@ -106,7 +106,7 @@ If you use anaconda you can install Samtools with
 
     conda install -c bioconda samtools openssl=1.0
 
-The ST Pipeline recommends a computer with at least 32GB of RAM (depending on the size of the genome) and 8 cpu cores. 
+The ST Pipeline needs a computer with at least 32GB of RAM (depending on the size of the genome) and 8 CPU cores.
 
 **Dependencies** 
 
diff --git a/README_SHORT b/README_SHORT
@@ -53,6 +53,4 @@ Basically what the ST pipeline does is:
 You can see a graphical more detailed description of the workflow in the documents workflow.pdf and workflow_extended.pdf
 
 The output will be a matrix of counts (genes as columns, spots as rows),
-a BED file containing the transcripts (Read name, coordinate, gene, etc..), and a JSON
-file with useful stats.
-The ST pipeline will also output a log file with useful information.
+and a log file with useful information and stats.
diff --git a/docsrc/changes.rst b/docsrc/changes.rst
@@ -1,6 +1,11 @@
 Changes
 -------
 
+**Version 1.8.1**
+
+* Fixed a bug when having barcodes after the UMI
+* Improved descriptions for parameters
+
 **Version 1.8.0**
 
 * Improved the unit-tests
diff --git a/docsrc/example.rst b/docsrc/example.rst
@@ -43,7 +43,7 @@ If you want to process Visium datasets it is recommended to use these settings
 .. code-block:: bash
 
     --allowed-missed 1 \
-  	--allowed-kmer 4 \
+    --allowed-kmer 4 \
   	--umi-allowed-mismatches 2 \
   	--umi-start-position 16 \
   	--umi-end-position 28 \
diff --git a/docsrc/intro.rst b/docsrc/intro.rst
@@ -39,12 +39,12 @@ The input FASTQ files can be given in gzip/bzip format as well.
 Basically what the ST pipeline does is:
 
 - Quality trimming (read 1 and read 2):
-	- Remove low quality bases
-	- Sanity check (reads same length, reads order, etc..)
-	- Check quality UMI (if provided)
-	- Remove artifacts (PolyT, PolyA, PolyG, PolyN and PolyC) of user defined length
-	- Check for AT and GC content
-	- Discard reads with a minimum number of bases of that failed any of the checks above
+    - Remove low quality bases
+    - Sanity check (reads same length, reads order, etc..)
+    - Check quality UMI (if provided)
+    - Remove artifacts (PolyT, PolyA, PolyG, PolyN and PolyC) of user defined length
+    - Check for AT and GC content
+    - Discard reads with a minimum number of bases that failed any of the checks above
 - Contamimant filter e.x. rRNA genome (Optional)
 - Mapping with STAR (only read 2)
 - Demultiplexing with [Taggd](https://github.com/SpatialTranscriptomicsResearch/taggd) (only read 1)
@@ -55,7 +55,5 @@ Basically what the ST pipeline does is:
 
 You can see a graphical more detailed description of the workflow in the documents workflow.pdf and workflow_extended.pdf
 
-The output will be a matrix of counts (genes as columns, spots as rows),
-a BED file containing the transcripts (Read name, coordinate, gene, etc..), and a JSON
-file with useful stats.
-The ST pipeline will also output a log file with useful information.
+The output will be a matrix of counts (genes as columns, spots as rows)
+and a log file with useful information and stats.
diff --git a/docsrc/manual.rst b/docsrc/manual.rst
diff --git a/scripts/st_qa.py b/scripts/st_qa.py
@@ -120,16 +120,16 @@ def main(input_data):
     plt.clf()
 
     # Generate density plots
-    sns.displot(aggregated_gene_counts, hist=False, label="Counts > 0")
-    sns.displot(aggregated_gene_counts_1, hist=False, label="Counts > 1")
+    sns.distplot(aggregated_gene_counts, hist=False, label="Counts > 0")
+    sns.distplot(aggregated_gene_counts_1, hist=False, label="Counts > 1")
     sns_plot = sns.distplot(aggregated_gene_counts_2,
                             axlabel="#Genes", hist=False, label="Counts > 2")
     fig = sns_plot.get_figure()
     fig.savefig(input_name + "_density_genes_by_spot.pdf")
     plt.clf()
 
-    sns.displot(aggregated_gene_gene_counts, hist=False, label="Counts > 0")
-    sns.displot(aggregated_gene_gene_counts_1, hist=False, label="Counts > 1")
+    sns.distplot(aggregated_gene_gene_counts, hist=False, label="Counts > 0")
+    sns.distplot(aggregated_gene_gene_counts_1, hist=False, label="Counts > 1")
     sns_plot = sns.distplot(aggregated_gene_gene_counts_2,
                             axlabel="#Spots", hist=False, label="Counts > 2")
     fig = sns_plot.get_figure()
diff --git a/stpipeline/core/mapping.py b/stpipeline/core/mapping.py
@@ -205,7 +205,6 @@ def barcodeDemultiplexing(reads,
                           idFile,
                           mismatches,
                           kmer,
-                          start_positon,
                           over_hang,
                           taggd_metric,
                           taggd_multiple_hits_keep_one,
@@ -223,7 +222,6 @@ def barcodeDemultiplexing(reads,
     :param idFile: a tab delimited file (BARCODE - X - Y) containing all the barcodes
     :param mismatches: the number of allowed mismatches
     :param kmer: the kmer length
-    :param start_positon: the start position of the barcode
     :param over_hang: the number of bases to allow for overhang
     :param taggd_metric: the distance metric algorithm (Subglobal, Levensthein or Hamming)
     :param taggd_multiple_hits_keep_one: when True keep one random hit when multiple candidates
@@ -234,7 +232,6 @@ def barcodeDemultiplexing(reads,
     :type idFile: str
     :type mismatches: int
     :type kmer: int
-    :type start_positon: int
     :type over_hang: int
     :type taggd_metric: str
     :type taggd_multiple_hits_keep_one: bool
@@ -271,13 +268,12 @@ def barcodeDemultiplexing(reads,
 
     args += ["--max-edit-distance", mismatches,
              "--k", kmer,
-             "--barcode-tag", "B0",  # if input is BAM we tell taggd what tag contains the barcode
-             "--start-position", start_positon,
+             "--barcode-tag", "B0",  # if input is BAM we tell taggd which tag contains the barcode
              "--homopolymer-filter", 0,
              "--subprocesses", cores,
              "--metric", taggd_metric,
-             "--overhang", over_hang]  # ,
-    # '--use-samtools-merge'] # Could be added to merge using samtools instead of pysam WIP on taggd
+             "--overhang", over_hang]
+    # NOTE: "--use-samtools-merge" could be added to merge using samtools instead of pysam (WIP on taggd)
 
     if taggd_multiple_hits_keep_one:
         args.append("--multiple-hits-keep-one")
diff --git a/stpipeline/core/pipeline.py b/stpipeline/core/pipeline.py
diff --git a/testrun.py b/testrun.py