COMBINE-lab
diff --git a/‎bin/pyroe
Lines changed: 31 additions & 0 deletions b/‎bin/pyroe
Lines changed: 31 additions & 0 deletions
diff --git a/‎docs/Makefile
Lines changed: 20 additions & 0 deletions b/‎docs/Makefile
Lines changed: 20 additions & 0 deletions
diff --git a/‎docs/logo.png
47.5 KB b/‎docs/logo.png
47.5 KB
diff --git a/‎docs/make.bat
Lines changed: 35 additions & 0 deletions b/‎docs/make.bat
Lines changed: 35 additions & 0 deletions
diff --git a/‎docs/requirements.txt
Lines changed: 4 additions & 0 deletions b/‎docs/requirements.txt
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/source/LICENSE.rst
Lines changed: 32 additions & 0 deletions b/‎docs/source/LICENSE.rst
Lines changed: 32 additions & 0 deletions
diff --git a/‎docs/source/building_splici_index.rst
Lines changed: 60 additions & 0 deletions b/‎docs/source/building_splici_index.rst
Lines changed: 60 additions & 0 deletions
diff --git a/‎docs/source/conf.py
Lines changed: 57 additions & 0 deletions b/‎docs/source/conf.py
Lines changed: 57 additions & 0 deletions
diff --git a/‎docs/source/converting_quants.rst
Lines changed: 43 additions & 0 deletions b/‎docs/source/converting_quants.rst
Lines changed: 43 additions & 0 deletions
@@ -1,8 +1,11 @@
 #!/usr/bin/env python
 
+import logging
+
 from pyroe import make_splici_txome
 from pyroe import fetch_processed_quant
 from pyroe import convert
+from pyroe import id_to_name
 from pyroe import output_formats
 
 if __name__ == "__main__":
@@ -137,6 +140,24 @@ if __name__ == "__main__":
         help="A flag indicates whether help messaged should not be printed.",
     )
 
+
+    parser_id_to_name = subparsers.add_parser(
+        "id-to-name", help="Generate a gene id to gene name mapping file from a GTF."
+    )
+    parser_id_to_name.add_argument(
+        "gtf_file",
+        help="The GTF input file."
+    )
+    parser_id_to_name.add_argument(
+        "output",
+        help="The path to where the output tsv file will be written."
+    )
+    parser_id_to_name.add_argument(
+        "--format",
+        help="The input format of the file (must be either GTF or GFF3). This will be inferred from the filename, but if that fails it can be provided explicitly.",
+        default=None
+    )
+
     out_formats = output_formats()
     parser_convert = subparsers.add_parser(
         "convert", help="Convert alevin-fry quantification result to another format."
@@ -160,6 +181,14 @@ if __name__ == "__main__":
         default="h5ad",
         help=f"The format in which the output should be written, one of {out_formats}.",
     )
+    parser_convert.add_argument(
+        "--geneid-to-name",
+        type=str,
+        required=False,
+        help="A 2 column tab-separated list of gene ID to gene name mappings. Providing this file will project gene IDs to gene names in the output."
+    )
+
+    logging.basicConfig(level=logging.INFO)
 
     # Execute the parse_args() method
     args = parser.parse_args()
@@ -188,6 +217,8 @@ if __name__ == "__main__":
         )
     elif args.command == "convert":
         convert(args)
+    elif args.command == "id-to-name":
+        id_to_name(args)
     else:
         print(parser.print_help())
         sys.exit(1)
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
@@ -0,0 +1,4 @@
+####### 
+#
+###### Custom themes (no version specifiers) ######
+furo
@@ -0,0 +1,32 @@
+License
+=======
+
+BSD 3-Clause License
+
+Copyright (c) 2020, Mohsen Zakeri, Avi Srivastava, Hirak Sarkar, Dongze He, Rob Patro
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,60 @@
+Preparing a splici index for quantification with alevin-fry
+===========================================================
+
+The USA mode in alevin-fry requires a special index reference, which is called the *splici* reference. The *splici* reference contains the spliced transcripts plus the intronic sequences of each gene. The ``make_splici_txome()`` function is designed to make the *splici* reference by taking a genome FASTA file and a gene annotation GTF file as the input. Details about the *splici* can be found in Section S2 of the supplementary file of the `alevin-fry paper <https://www.nature.com/articles/s41592-022-01408-3>`_. To run ``pyroe``, you also need to specify the read length argument ``read_length`` of the experiment you are working on and the flank trimming length ``flank_trim_length``. A final flank length will be computed as the difference between the read_length and flank trimming length and will be attached to the ends of each intron to absorb the intron-exon junctional reads.
+
+The following is an example of calling `pyroe` to make the *splici* index reference. The final flank length is calculated as the difference between the read length and the flank_trim_length, i.e., 5-2=3. This function allows you to add extra spliced and unspliced sequences to the *splici* index, which will be useful when some unannotated sequences, such as mitochondrial genes, are important for your experiment. **Note** : to make `pyroe` work more quickly, it is recommended to have the latest version of `bedtools <https://bedtools.readthedocs.io/en/latest/>`_ (`Aaron R. Quinlan and Ira M. Hall, 2010 <https://doi.org/10.1093/bioinformatics/btq033>`_) installed.
+
+.. code:: bash
+
+  pyroe make-splici extdata/small_example_genome.fa extdata/small_example.gtf 5 splici_txome --flank-trim-length 2 --filename-prefix transcriptome_splici --dedup-seqs
+        
+        
+The `pyroe` program writes two files to your specified output directory `output_dir`. They are:
+
+* A FASTA file that stores the extracted splici sequences.
+* A three-column transcript-name-to-gene-name file that stores the name of each transcript in the splici index reference, their corresponding gene name, and the splicing status (`S` for spliced and `U` for unspliced) of those transcripts.
+
+Full usage
+==========
+
+.. code:: bash 
+
+  usage: pyroe make-splici [-h] [--filename-prefix FILENAME_PREFIX]
+                           [--flank-trim-length FLANK_TRIM_LENGTH]
+                           [--extra-spliced EXTRA_SPLICED]
+                           [--extra-unspliced EXTRA_UNSPLICED]
+                           [--bt-path BT_PATH] [--dedup-seqs] [--no-bt]
+                           [--no-flanking-merge]
+                           genome-path gtf-path read-length output-dir
+
+  positional arguments:
+    genome-path           The path to a genome FASTA file.
+    gtf-path              The path to a gtf file.
+    read-length           The read length of the single-cell experiment 
+                            being processed (determines flank size).
+    output-dir            The output directory where splici reference 
+                            files will be written.
+
+  optional arguments:
+    -h, --help            show this help message and exit
+    --filename-prefix FILENAME_PREFIX
+                          The file name prefix of the generated output files.
+    --flank-trim-length FLANK_TRIM_LENGTH
+                          Determines the amount subtracted from the read length
+                          to get the flank length.
+    --extra-spliced EXTRA_SPLICED
+                          The path to an extra spliced sequence fasta file.
+    --extra-unspliced EXTRA_UNSPLICED
+                          The path to an extra unspliced sequence fasta file.
+    --bt-path BT_PATH     The path to bedtools v2.30.0 or greater.
+    --dedup-seqs          A flag indicates whether identical sequences will be
+                            deduplicated.
+    --no-bt               A flag indicates whether to disable bedtools.
+    --no-flanking-merge   A flag indicates whether introns will be merged after
+                            adding flanking length.
+
+
+The *splici* index
+------------------
+
+The *splici* index of a given species consists of the transcriptome of the species, i.e., the spliced transcripts, and the intronic sequences of the species. Within a gene, if the flanked intronic sequences overlap with each other, the overlapped intronic sequences will be collapsed as a single intronic sequence to make sure each base will appear only once in the intronic sequences. For more detailed information, please check the section S2 in the supplementary file of `alevin-fry manuscript <https://www.biorxiv.org/content/10.1101/2021.06.29.450377v2>`_.
@@ -0,0 +1,57 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "pyroe"
+copyright = "2021-2022, Dongze He, Rob Patro"
+author = "Dongze He, Rob Patro"
+
+# The full version, including alpha/beta/rc tags
+release = "0.6.0"
+
+master_doc = "index"
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ["sphinx.ext.autosectionlabel"]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "furo"
+
+html_logo = "../logo.png"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+# html_static_path = ["_static"]
@@ -0,0 +1,43 @@
+Converting quantification results
+=================================
+
+The ``convert`` sub-command of ``pyroe`` can convert the output of `alevin-fry` into several common formats, such as 
+the native `AnnData` format (``h5ad``).  Further, when performing this conversion, it can organize the unspliced, 
+spliced, and ambiguous counts as desired by the user.
+
+The sub-command takes as input a quantification directory produced by ``alevin-fry``, and an output location.
+Additionally, the user should pass in command line parameters to describe the desired output structure, and
+output format. The output structure defines how the ``U``, ``S``, and ``A`` layers of the input quantification should
+be represented in the converted matrix.  The syntax for this flag exactly mimics the ``output_format`` argument of
+the ``load_fry`` function, which you can read about `here <https://pyroe.readthedocs.io/en/latest/building_splici_index.html#load-fry-notes>`_.
+Note that, if you pass in a custom output structure, you should enclose your format description in quotes.  For
+example, to output to an object where the "main" layer (``X``) contains the sum of ``U``, ``S``, and ``A``, and where
+there is an additional layer named `unspliced` having just the unspliced counts, you would pass
+``--output-structure '{ "X" : ["U", "S", "A"], "unspliced" : ["U"]}'``. 
+
+If you do not explicitly provide an ``--output-format``, the default of ``h5ad`` will be used.
+
+The *optional* ``--geneid-to-name`` parameter allows you to pass in a 2-column tab-separated file mapping gene identifiers to gene names.
+If this is provided, then gene IDs will be converted to gene names in the output matrix.  Gene names will be made unique using the ``var_names_make_unique()`` function of `ScanPy <https://scanpy-tutorials.readthedocs.io/en/latest/index.html>`_.
+It is also possible that some gene IDs do not have a mapped name.  In this case, the ``convert`` subcommand will also write out a JSON format file, at the provided output path, with the additional suffix ``_unmapped_ids.json``.
+This file contains a list of the gene IDs that could not successfully be mapped to a name given the provided mapping.
+
+``convert`` command full usage
+------------------------------
+
+.. code:: bash
+
+  usage: pyroe convert [-h] [--output-structure OUTPUT_STRUCTURE] [--output-format OUTPUT_FORMAT] [--geneid-to-name GENEID_TO_NAME] quant_dir output
+
+  positional arguments:
+    quant_dir             The input quantification directory containing the matrix to be converted.
+    output                The output name where the quantification matrix should be written. For `csvs` output format, this will be a directory. For all others, it will be a file.
+
+  optional arguments:
+    -h, --help            show this help message and exit
+    --output-structure OUTPUT_STRUCTURE
+                          The structure that U,S and A counts should occupy in the output matrix.
+    --output-format OUTPUT_FORMAT
+                          The format in which the output should be written, one of {'zarr', 'loom', 'csvs', 'h5ad'}.
+    --geneid-to-name GENEID_TO_NAME
+                          A 2 column tab-separated list of gene ID to gene name mappings. Providing this file will project gene IDs to gene names in the output.
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +#######
 +#
 +###### Custom themes (no version specifiers) ######
 +furo