Skip to content

Commit f87f7c6

Browse files
authored
Merge pull request #2 from COMBINE-lab/develop
Merge develop into main for 0.6.0 release
2 parents 7a3f43f + bd71b48 commit f87f7c6

18 files changed

+655
-2
lines changed

bin/pyroe

+31
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
#!/usr/bin/env python
22

3+
import logging
4+
35
from pyroe import make_splici_txome
46
from pyroe import fetch_processed_quant
57
from pyroe import convert
8+
from pyroe import id_to_name
69
from pyroe import output_formats
710

811
if __name__ == "__main__":
@@ -137,6 +140,24 @@ if __name__ == "__main__":
137140
help="A flag indicates whether help messaged should not be printed.",
138141
)
139142

143+
144+
parser_id_to_name = subparsers.add_parser(
145+
"id-to-name", help="Generate a gene id to gene name mapping file from a GTF."
146+
)
147+
parser_id_to_name.add_argument(
148+
"gtf_file",
149+
help="The GTF input file."
150+
)
151+
parser_id_to_name.add_argument(
152+
"output",
153+
help="The path to where the output tsv file will be written."
154+
)
155+
parser_id_to_name.add_argument(
156+
"--format",
157+
help="The input format of the file (must be either GTF or GFF3). This will be inferred from the filename, but if that fails it can be provided explicitly.",
158+
default=None
159+
)
160+
140161
out_formats = output_formats()
141162
parser_convert = subparsers.add_parser(
142163
"convert", help="Convert alevin-fry quantification result to another format."
@@ -160,6 +181,14 @@ if __name__ == "__main__":
160181
default="h5ad",
161182
help=f"The format in which the output should be written, one of {out_formats}.",
162183
)
184+
parser_convert.add_argument(
185+
"--geneid-to-name",
186+
type=str,
187+
required=False,
188+
help="A 2 column tab-separated list of gene ID to gene name mappings. Providing this file will project gene IDs to gene names in the output."
189+
)
190+
191+
logging.basicConfig(level=logging.INFO)
163192

164193
# Execute the parse_args() method
165194
args = parser.parse_args()
@@ -188,6 +217,8 @@ if __name__ == "__main__":
188217
)
189218
elif args.command == "convert":
190219
convert(args)
220+
elif args.command == "id-to-name":
221+
id_to_name(args)
191222
else:
192223
print(parser.print_help())
193224
sys.exit(1)

docs/Makefile

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Minimal makefile for Sphinx documentation
2+
#
3+
4+
# You can set these variables from the command line, and also
5+
# from the environment for the first two.
6+
SPHINXOPTS ?=
7+
SPHINXBUILD ?= sphinx-build
8+
SOURCEDIR = source
9+
BUILDDIR = build
10+
11+
# Put it first so that "make" without argument is like "make help".
12+
help:
13+
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14+
15+
.PHONY: help Makefile
16+
17+
# Catch-all target: route all unknown targets to Sphinx using the new
18+
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19+
%: Makefile
20+
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/logo.png

47.5 KB
Loading

docs/make.bat

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
@ECHO OFF
2+
3+
pushd %~dp0
4+
5+
REM Command file for Sphinx documentation
6+
7+
if "%SPHINXBUILD%" == "" (
8+
set SPHINXBUILD=sphinx-build
9+
)
10+
set SOURCEDIR=source
11+
set BUILDDIR=build
12+
13+
if "%1" == "" goto help
14+
15+
%SPHINXBUILD% >NUL 2>NUL
16+
if errorlevel 9009 (
17+
echo.
18+
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19+
echo.installed, then set the SPHINXBUILD environment variable to point
20+
echo.to the full path of the 'sphinx-build' executable. Alternatively you
21+
echo.may add the Sphinx directory to PATH.
22+
echo.
23+
echo.If you don't have Sphinx installed, grab it from
24+
echo.http://sphinx-doc.org/
25+
exit /b 1
26+
)
27+
28+
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29+
goto end
30+
31+
:help
32+
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33+
34+
:end
35+
popd

docs/requirements.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#######
2+
#
3+
###### Custom themes (no version specifiers) ######
4+
furo

docs/source/LICENSE.rst

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
License
2+
=======
3+
4+
BSD 3-Clause License
5+
6+
Copyright (c) 2020, Mohsen Zakeri, Avi Srivastava, Hirak Sarkar, Dongze He, Rob Patro
7+
All rights reserved.
8+
9+
Redistribution and use in source and binary forms, with or without
10+
modification, are permitted provided that the following conditions are met:
11+
12+
1. Redistributions of source code must retain the above copyright notice, this
13+
list of conditions and the following disclaimer.
14+
15+
2. Redistributions in binary form must reproduce the above copyright notice,
16+
this list of conditions and the following disclaimer in the documentation
17+
and/or other materials provided with the distribution.
18+
19+
3. Neither the name of the copyright holder nor the names of its
20+
contributors may be used to endorse or promote products derived from
21+
this software without specific prior written permission.
22+
23+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
27+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

docs/source/building_splici_index.rst

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
Preparing a splici index for quantification with alevin-fry
2+
===========================================================
3+
4+
The USA mode in alevin-fry requires a special index reference, which is called the *splici* reference. The *splici* reference contains the spliced transcripts plus the intronic sequences of each gene. The ``make_splici_txome()`` function is designed to make the *splici* reference by taking a genome FASTA file and a gene annotation GTF file as the input. Details about the *splici* can be found in Section S2 of the supplementary file of the `alevin-fry paper <https://www.nature.com/articles/s41592-022-01408-3>`_. To run ``pyroe``, you also need to specify the read length argument ``read_length`` of the experiment you are working on and the flank trimming length ``flank_trim_length``. A final flank length will be computed as the difference between the read_length and flank trimming length and will be attached to the ends of each intron to absorb the intron-exon junctional reads.
5+
6+
The following is an example of calling ``pyroe`` to make the *splici* index reference. The final flank length is calculated as the difference between the read length and the flank trimming length (``flank_trim_length``), i.e., 5-2=3. This function allows you to add extra spliced and unspliced sequences to the *splici* index, which will be useful when some unannotated sequences, such as mitochondrial genes, are important for your experiment. **Note**: to make ``pyroe`` run more quickly, it is recommended to have the latest version of `bedtools <https://bedtools.readthedocs.io/en/latest/>`_ (`Aaron R. Quinlan and Ira M. Hall, 2010 <https://doi.org/10.1093/bioinformatics/btq033>`_) installed.
7+
8+
.. code:: bash
9+
10+
pyroe make-splici extdata/small_example_genome.fa extdata/small_example.gtf 5 splici_txome --flank-trim-length 2 --filename-prefix transcriptome_splici --dedup-seqs
11+
12+
13+
The ``pyroe`` program writes two files to your specified output directory ``output_dir``. They are:
14+
* A FASTA file that stores the extracted splici sequences.
15+
* A three-column transcript-name-to-gene-name file that stores the name of each transcript in the splici index reference, its corresponding gene name, and the splicing status (`S` for spliced and `U` for unspliced) of that transcript.
16+
17+
Full usage
18+
==========
19+
20+
.. code:: bash
21+
22+
usage: pyroe make-splici [-h] [--filename-prefix FILENAME_PREFIX]
23+
[--flank-trim-length FLANK_TRIM_LENGTH]
24+
[--extra-spliced EXTRA_SPLICED]
25+
[--extra-unspliced EXTRA_UNSPLICED]
26+
[--bt-path BT_PATH] [--dedup-seqs] [--no-bt]
27+
[--no-flanking-merge]
28+
genome-path gtf-path read-length output-dir
29+
30+
positional arguments:
31+
genome-path The path to a genome fasta file.
32+
gtf-path The path to a gtf file.
33+
read-length The read length of the single-cell experiment
34+
being processed (determines flank size).
35+
output-dir The output directory where splici reference
36+
files will be written.
37+
38+
optional arguments:
39+
-h, --help show this help message and exit
40+
--filename-prefix FILENAME_PREFIX
41+
The file name prefix of the generated output files.
42+
--flank-trim-length FLANK_TRIM_LENGTH
43+
Determines the amount subtracted from the read length
44+
to get the flank length.
45+
--extra-spliced EXTRA_SPLICED
46+
The path to an extra spliced sequence fasta file.
47+
--extra-unspliced EXTRA_UNSPLICED
48+
The path to an extra unspliced sequence fasta file.
49+
--bt-path BT_PATH The path to bedtools v2.30.0 or greater.
50+
--dedup-seqs A flag indicates whether identical sequences will be
51+
deduplicated.
52+
--no-bt A flag indicates whether to disable bedtools.
53+
--no-flanking-merge A flag indicates whether introns will be merged after
54+
adding flanking length.
55+
56+
57+
The *splici* index
58+
------------------
59+
60+
The *splici* index of a given species consists of the transcriptome of the species, i.e., the spliced transcripts, and the intronic sequences of the species. Within a gene, if the flanked intronic sequences overlap with each other, the overlapped intronic sequences will be collapsed into a single intronic sequence to make sure each base will appear only once in the intronic sequences. For more detailed information, please check Section S2 in the supplementary file of the `alevin-fry manuscript <https://www.biorxiv.org/content/10.1101/2021.06.29.450377v2>`_.

docs/source/conf.py

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Configuration file for the Sphinx documentation builder.
2+
#
3+
# This file only contains a selection of the most common options. For a full
4+
# list see the documentation:
5+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
6+
7+
# -- Path setup --------------------------------------------------------------
8+
9+
# If extensions (or modules to document with autodoc) are in another directory,
10+
# add these directories to sys.path here. If the directory is relative to the
11+
# documentation root, use os.path.abspath to make it absolute, like shown here.
12+
#
13+
# import os
14+
# import sys
15+
# sys.path.insert(0, os.path.abspath('.'))
16+
17+
18+
# -- Project information -----------------------------------------------------
19+
20+
project = "pyroe"
21+
copyright = "2021-2022, Dongze He, Rob Patro"
22+
author = "Dongze He, Rob Patro"
23+
24+
# The full version, including alpha/beta/rc tags
25+
release = "0.6.0"
26+
27+
master_doc = "index"
28+
29+
# -- General configuration ---------------------------------------------------
30+
31+
# Add any Sphinx extension module names here, as strings. They can be
32+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33+
# ones.
34+
extensions = ["sphinx.ext.autosectionlabel"]
35+
36+
# Add any paths that contain templates here, relative to this directory.
37+
templates_path = ["_templates"]
38+
39+
# List of patterns, relative to source directory, that match files and
40+
# directories to ignore when looking for source files.
41+
# This pattern also affects html_static_path and html_extra_path.
42+
exclude_patterns = []
43+
44+
45+
# -- Options for HTML output -------------------------------------------------
46+
47+
# The theme to use for HTML and HTML Help pages. See the documentation for
48+
# a list of builtin themes.
49+
#
50+
html_theme = "furo"
51+
52+
html_logo = "../logo.png"
53+
54+
# Add any paths that contain custom static files (such as style sheets) here,
55+
# relative to this directory. They are copied after the builtin static files,
56+
# so a file named "default.css" will overwrite the builtin "default.css".
57+
# html_static_path = ["_static"]

docs/source/converting_quants.rst

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
Converting quantification results
2+
=================================
3+
4+
The ``convert`` sub-command of ``pyroe`` can convert the output of `alevin-fry` into several common formats, such as
5+
the native `AnnData` format (``h5ad``). Further, when performing this conversion, it can organize the unspliced,
6+
spliced, and ambiguous counts as desired by the user.
7+
8+
The sub-command takes as input a quantification directory produced by ``alevin-fry``, and an output location.
9+
Additionally, the user should pass in command line parameters to describe the desired output structure, and
10+
output format. The output structure defines how the ``U``, ``S``, and ``A`` layers of the input quantification should
11+
be represented in the converted matrix. The syntax for this flag exactly mimics the ``output_format`` argument of
12+
the ``load_fry`` function, which you can read about `here <https://pyroe.readthedocs.io/en/latest/building_splici_index.html#load-fry-notes>`_.
13+
Note that, if you pass in a custom output structure, you should enclose your format description in quotes. For
14+
example, to output to an object where the "main" layer (``X``) contains the sum of ``U``, ``S``, and ``A``, and where
15+
there is an additional layer named `unspliced` having just the unspliced counts, you would pass
16+
``--output-structure '{ "X" : ["U", "S", "A"], "unspliced" : ["U"]}'``.
17+
18+
If you do not explicitly provide an ``--output-format``, the default of ``h5ad`` will be used.
19+
20+
The *optional* ``--geneid-to-name`` parameter allows you to pass in a 2-column tab-separated file mapping gene identifiers to gene names.
21+
If this is provided, then gene IDs will be converted to gene names in the output matrix. Gene names will be made unique using the ``var_names_make_unique()`` function of `ScanPy <https://scanpy-tutorials.readthedocs.io/en/latest/index.html>`_.
22+
It is also possible that some gene IDs do not have a mapped name. In this case, the ``convert`` subcommand will also write out a JSON format file, at the provided output path, with the additional suffix ``_unmapped_ids.json``.
23+
This file contains a list of the gene IDs that could not successfully be mapped to a name given the provided mapping.
24+
25+
``convert`` command full usage
26+
------------------------------
27+
28+
.. code:: bash
29+
30+
usage: pyroe convert [-h] [--output-structure OUTPUT_STRUCTURE] [--output-format OUTPUT_FORMAT] [--geneid-to-name GENEID_TO_NAME] quant_dir output
31+
32+
positional arguments:
33+
quant_dir The input quantification directory containing the matrix to be converted.
34+
output The output name where the quantification matrix should be written. For `csvs` output format, this will be a directory. For all others, it will be a file.
35+
36+
optional arguments:
37+
-h, --help show this help message and exit
38+
--output-structure OUTPUT_STRUCTURE
39+
The structure that U,S and A counts should occupy in the output matrix.
40+
--output-format OUTPUT_FORMAT
41+
The format in which the output should be written, one of {'zarr', 'loom', 'csvs', 'h5ad'}.
42+
--geneid-to-name GENEID_TO_NAME
43+
A 2 column tab-separated list of gene ID to gene name mappings. Providing this file will project gene IDs to gene names in the output.

0 commit comments

Comments
 (0)