Skip to content

Commit a39ae4f

Browse files
Alternative ways to get genomic positions (#150)
* Update README.md * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Make gtfparse optional * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Draft function to get genomic position from biomart * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add gtf testdata * WIP * Update tutorial * Add unit test for gtfs * use UV in CI * pre-commit * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove gtfparse from hard dependencies * update api doc * Fix link in README * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 842ab1f commit a39ae4f

File tree

12 files changed

+4416
-23
lines changed

12 files changed

+4416
-23
lines changed

.github/workflows/test.yaml

+6-10
Original file line numberDiff line numberDiff line change
@@ -46,19 +46,15 @@ jobs:
4646
uses: actions/setup-python@v4
4747
with:
4848
python-version: ${{ matrix.python }}
49-
cache: "pip"
50-
cache-dependency-path: "**/pyproject.toml"
5149
- uses: r-lib/actions/setup-r@v2
5250
with:
5351
r-version: ${{ matrix.r }}
5452
use-public-rspm: true
53+
- name: Install uv
54+
uses: astral-sh/setup-uv@v5
5555

56-
- name: Install test dependencies
57-
run: |
58-
python -m pip install --upgrade pip wheel
59-
- name: Install dependencies
60-
run: |
61-
pip install ".[dev,test,copykat]"
56+
- name: Install the project
57+
run: uv sync --extra dev --extra test --extra gtf --extra copykat
6258

6359
- name: Install R dependencies
6460
run: |
@@ -73,9 +69,9 @@ jobs:
7369
PLATFORM: ${{ matrix.os }}
7470
DISPLAY: :42
7571
run: |
76-
coverage run -m pytest -v --color=yes
72+
uv run coverage run -m pytest -v --color=yes
7773
- name: Report coverage
7874
run: |
79-
coverage report
75+
uv run coverage report
8076
- name: Upload coverage
8177
uses: codecov/codecov-action@v3

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
.DS_Store
33
*~
44
buck-out/
5+
.pybiomart.sqlite
56

67
# Compiled files
78
.venv/

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
[![Documentation][badge-docs]][link-docs]
55
[![PyPI][badge-pypi]][link-pypi]
66

7-
[badge-tests]: https://img.shields.io/github/actions/workflow/status/icbi-lab/infercnvpy/test.yaml?branch=main
7+
[badge-tests]: https://github.com/icbi-lab/infercnvpy/actions/workflows/test.yaml/badge.svg
88
[link-tests]: https://github.com/icbi-lab/infercnvpy/actions/workflows/test.yaml
99
[badge-docs]: https://img.shields.io/readthedocs/infercnvpy
1010
[badge-pypi]: https://img.shields.io/pypi/v/infercnvpy?logo=PyPI
@@ -80,5 +80,5 @@ n/a
8080
[scverse-discourse]: https://discourse.scverse.org/
8181
[issue-tracker]: https://github.com/icbi-lab/infercnvpy/issues
8282
[changelog]: https://infercnvpy.readthedocs.io/latest/changelog.html
83-
[link-docs]: https://infercnvpy.readthedocs.io
83+
[link-docs]: https://infercnvpy.readthedocs.io/
8484
[link-api]: https://infercnvpy.readthedocs.io/en/latest/api.html

docs/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Input/Output: `io`
2121
.. autosummary::
2222
:toctree: ./generated
2323

24+
genomic_position_from_biomart
2425
genomic_position_from_gtf
2526
read_scevan
2627

docs/notebooks/tutorial_3k.ipynb

+10-6
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,10 @@
8282
"cell_type": "raw",
8383
"id": "respected-outreach",
8484
"metadata": {
85-
"raw_mimetype": "text/restructuredtext"
85+
"raw_mimetype": "text/restructuredtext",
86+
"vscode": {
87+
"languageId": "raw"
88+
}
8689
},
8790
"source": [
8891
".. note::\n",
@@ -98,8 +101,9 @@
98101
" the start and end positions on that chromosome for each gene, \n",
99102
" respectively. \n",
100103
" \n",
101-
" Infercnvpy provides the :func:`infercnvpy.io.genomic_position_from_gtf` function\n",
102-
" to read these information from a GTF file and add them to `adata.var`. \n",
104+
" Infercnvpy provides the :func:`infercnvpy.io.genomic_position_from_biomart` and \n",
105+
" :func:`infercnvpy.io.genomic_position_from_gtf` functions\n",
106+
" to get this information online or from a GTF file and store it in `adata.var`. \n",
103107
" \n",
104108
"The example dataset is already appropriately preprocessed. "
105109
]
@@ -1448,9 +1452,9 @@
14481452
"notebook_metadata_filter": "-kernelspec"
14491453
},
14501454
"kernelspec": {
1451-
"display_name": "Python [conda env:micromamba-infercnvpy]",
1455+
"display_name": ".venv",
14521456
"language": "python",
1453-
"name": "conda-env-micromamba-infercnvpy-py"
1457+
"name": "python3"
14541458
},
14551459
"language_info": {
14561460
"codemirror_mode": {
@@ -1462,7 +1466,7 @@
14621466
"name": "python",
14631467
"nbconvert_exporter": "python",
14641468
"pygments_lexer": "ipython3",
1465-
"version": "3.10.12"
1469+
"version": "3.11.10"
14661470
}
14671471
},
14681472
"nbformat": 4,

pyproject.toml

+10-2
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,24 @@ urls.Source = "https://github.com/icbi-lab/infercnvpy"
2424
urls.Home-page = "https://github.com/icbi-lab/infercnvpy"
2525
dependencies = [
2626
'anndata>=0.7.3',
27-
"scanpy>=1.9",
27+
"scanpy>=1.10",
2828
'pandas>=1',
2929
'numpy>=1.20', # includes type annotations
3030
'tqdm>=4.63.0', # fixes tqdm.auto
3131
'pytoml',
32-
'gtfparse>=2.1',
3332
'pycairo>=1.20; sys_platform == "win32"',
3433
'leidenalg',
3534
'pyreadr',
3635
'pytest-benchmark',
3736
# for debug logging (referenced from the issue template)
3837
"session-info",
38+
"pybiomart>=0.2.0",
3939
]
4040

4141
[project.optional-dependencies]
42+
gtf = [
43+
'gtfparse>=2.1'
44+
]
4245
copykat = [
4346
'rpy2'
4447
]
@@ -60,10 +63,12 @@ doc = [
6063
'pycairo',
6164
'jupyter_client',
6265
"pandas",
66+
"setuptools", # required for sphinxcontrib-bibtex
6367
]
6468
test = [
6569
"pytest",
6670
"coverage",
71+
"openpyxl", # required for one of the scanpy datasets used in the tests
6772
]
6873

6974
[tool.hatch.version]
@@ -155,3 +160,6 @@ skip = [
155160
"docs/references.md",
156161
"docs/notebooks/example.ipynb",
157162
]
163+
164+
[tool.uv.sources]
165+
gtfparse = { git = "https://github.com/lrauschning/gtfparse.git", rev = "dev" }

src/infercnvpy/io/__init__.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
1-
from ._genepos import genomic_position_from_gtf
1+
from ._genepos import genomic_position_from_biomart, genomic_position_from_gtf
22
from ._scevan import read_scevan
3+
4+
__all__ = ["genomic_position_from_gtf", "genomic_position_from_biomart", "read_scevan"]

src/infercnvpy/io/_genepos.py

+98-2
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,96 @@
11
from pathlib import Path
22
from typing import Literal
33

4-
import gtfparse
54
import numpy as np
65
import pandas as pd
6+
import scanpy.queries
77
from anndata import AnnData
88
from scanpy import logging
99

1010

11+
def genomic_position_from_biomart(
    adata: AnnData | None = None,
    *,
    adata_gene_id: str | None = None,
    biomart_gene_id: str = "ensembl_gene_id",
    species: str = "hsapiens",
    inplace: bool = True,
    **kwargs,
) -> pd.DataFrame | None:
    """
    Get genomic gene positions from ENSEMBL Biomart.

    Parameters
    ----------
    adata
        Adds the genomic positions to `adata.var`. If adata is None, returns
        a data frame with the genomic positions instead.
    adata_gene_id
        Column in `adata.var` that contains (ENSEMBL) gene IDs. If not specified,
        use `adata.var_names`.
    biomart_gene_id
        The biomart column to use as gene identifier. Typically this would be `ensembl_gene_id` or `hgnc_symbol`,
        but could be different for other species.
    species
        Species identifier passed as first argument to :func:`scanpy.queries.biomart_annotations`.
    inplace
        If True, add the annotations directly to adata, otherwise return a dataframe.
    **kwargs
        passed on to :func:`scanpy.queries.biomart_annotations`

    Returns
    -------
    If `adata` is None, the biomart annotation (columns `biomart_gene_id`, `start`, `end`, `chromosome`)
    without ambiguous identifiers. Otherwise, a copy of `adata.var` with the annotation columns
    merged in if `inplace` is False, or None if `inplace` is True (`adata.var` is replaced).
    """
    biomart_annot = (
        scanpy.queries.biomart_annotations(
            species,
            [
                biomart_gene_id,
                "start_position",
                "end_position",
                "chromosome_name",
            ],
            **kwargs,
        )
        .rename(
            columns={
                "start_position": "start",
                "end_position": "end",
                "chromosome_name": "chromosome",
            }
        )
        # use chr prefix for chromosome
        .assign(chromosome=lambda x: "chr" + x["chromosome"])
    )

    # The coverage check only makes sense when there is an adata to compare against.
    if adata is not None:
        gene_ids_adata = (adata.var_names if adata_gene_id is None else adata.var[adata_gene_id]).values
        missing_from_biomart = len(set(gene_ids_adata) - set(biomart_annot[biomart_gene_id].values))
        if missing_from_biomart:
            logging.warning(
                f"Biomart misses annotation for {missing_from_biomart} genes in adata. Did you use ENSEMBL ids?"
            )

    # An identifier that occurs more than once cannot be mapped unambiguously, so *all* of its
    # rows are removed. Report the number of distinct identifiers dropped (not the number of
    # surplus rows), so that the message matches what is actually skipped.
    is_duplicated = biomart_annot[biomart_gene_id].duplicated(keep=False)
    if is_duplicated.any():
        n_skipped = biomart_annot.loc[is_duplicated, biomart_gene_id].nunique(dropna=False)
        logging.warning(f"Skipped {n_skipped} genes because of duplicate identifiers in the biomart annotation.")
        biomart_annot = biomart_annot.loc[~is_duplicated, :]

    # Without an adata there is nothing to merge into: hand back the positions themselves.
    if adata is None:
        return biomart_annot

    # Move the var index into a regular column so that it survives the merge,
    # then restore it (including its original name) afterwards.
    tmp_var = adata.var.copy()
    orig_index_name = tmp_var.index.name
    TMP_INDEX_NAME = "adata_var_index"
    tmp_var.index.name = TMP_INDEX_NAME
    tmp_var.reset_index(inplace=True)
    var_annotated = tmp_var.merge(
        biomart_annot,
        how="left",
        left_on=TMP_INDEX_NAME if adata_gene_id is None else adata_gene_id,
        right_on=biomart_gene_id,
        validate="one_to_one",
    )
    var_annotated.set_index(TMP_INDEX_NAME, inplace=True)
    var_annotated.index.name = orig_index_name

    if inplace:
        adata.var = var_annotated
        return None
    return var_annotated
93+
1194
def genomic_position_from_gtf(
1295
gtf_file: Path | str,
1396
adata: AnnData | None = None,
@@ -16,7 +99,8 @@ def genomic_position_from_gtf(
1699
adata_gene_id: str | None = None,
17100
inplace: bool = True,
18101
) -> pd.DataFrame | None:
19-
"""Get genomic gene positions from a GTF file.
102+
"""
103+
Get genomic gene positions from a GTF file.
20104
21105
The GTF file needs to match the genome annotation used for your single cell dataset.
22106
@@ -38,6 +122,12 @@ def genomic_position_from_gtf(
38122
inplace
39123
If True, add the annotations directly to adata, otherwise return a dataframe.
40124
"""
125+
try:
126+
import gtfparse
127+
except ImportError:
128+
raise ImportError(
129+
"genomic_position_from_gtf requires gtfparse as optional dependency. Please install it using `pip install gtfparse`."
130+
) from None
41131
gtf = gtfparse.read_gtf(
42132
gtf_file, usecols=["seqname", "feature", "start", "end", "gene_id", "gene_name"]
43133
).to_pandas()
@@ -49,6 +139,8 @@ def genomic_position_from_gtf(
49139
.drop_duplicates()
50140
.rename(columns={"seqname": "chromosome"})
51141
)
142+
# remove ensembl versions
143+
gtf["gene_id"] = gtf["gene_id"].str.replace(r"\.\d+$", "", regex=True)
52144

53145
gene_ids_adata = (adata.var_names if adata_gene_id is None else adata.var[adata_gene_id]).values
54146
gtf = gtf.loc[gtf[gtf_gene_id].isin(gene_ids_adata), :]
@@ -77,6 +169,10 @@ def genomic_position_from_gtf(
77169
var_annotated.set_index(TMP_INDEX_NAME, inplace=True)
78170
var_annotated.index.name = orig_index_name
79171

172+
# if not a gencode GTF, let's add 'chr' prefix:
173+
if np.all(~var_annotated["chromosome"].dropna().str.startswith("chr")):
174+
var_annotated["chromosome"] = "chr" + var_annotated["chromosome"]
175+
80176
if inplace:
81177
adata.var = var_annotated
82178
else:

tests/conftest.py

+7
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from pathlib import Path
2+
13
import numpy as np
24
import pandas as pd
35
import pytest
@@ -7,6 +9,11 @@
79
import infercnvpy as cnv
810

911

12+
@pytest.fixture()
def testdata():
    """Path to the ``data`` directory that sits next to this test module."""
    here = Path(__file__).parent
    return here / "data"
15+
16+
1017
@pytest.fixture(params=[np.array, sp.csr_matrix, sp.csc_matrix])
1118
def adata_oligodendroma(request):
1219
"""Adata with raw counts in .X parametrized to be either sparse or dense."""

0 commit comments

Comments
 (0)