1
1
from pathlib import Path
2
2
from typing import Literal
3
3
4
- import gtfparse
5
4
import numpy as np
6
5
import pandas as pd
6
+ import scanpy .queries
7
7
from anndata import AnnData
8
8
from scanpy import logging
9
9
10
10
11
+ def genomic_position_from_biomart (
12
+ adata : AnnData | None = None ,
13
+ * ,
14
+ adata_gene_id : str | None = None ,
15
+ biomart_gene_id = "ensembl_gene_id" ,
16
+ species : str = "hsapiens" ,
17
+ inplace : bool = True ,
18
+ ** kwargs ,
19
+ ):
20
+ """
21
+ Get genomic gene positions from ENSEMBL Biomart.
22
+
23
+ Parameters
24
+ ----------
25
+ adata
26
+ Adds the genomic positions to `adata.var`. If adata is None, returns
27
+ a data frame with the genomic positions instead.
28
+ adata_gene_id
29
+ Column in `adata.var` that contains (ENSMBL) gene IDs. If not specified,
30
+ use `adata.var_names`.
31
+ biomart_gene_id
32
+ The biomart column to use as gene identifier. Typically this would be `ensembl_gene_id` or `hgnc_symbol`,
33
+ but could be different for other species.
34
+ inplace
35
+ If True, add the annotations directly to adata, otherwise return a dataframe.
36
+ **kwargs
37
+ passed on to :func:`scanpy.queries.biomart_annotations`
38
+ """
39
+ biomart_annot = (
40
+ scanpy .queries .biomart_annotations (
41
+ species ,
42
+ [
43
+ biomart_gene_id ,
44
+ "start_position" ,
45
+ "end_position" ,
46
+ "chromosome_name" ,
47
+ ],
48
+ ** kwargs ,
49
+ )
50
+ .rename (
51
+ columns = {
52
+ "start_position" : "start" ,
53
+ "end_position" : "end" ,
54
+ "chromosome_name" : "chromosome" ,
55
+ }
56
+ )
57
+ # use chr prefix for chromosome
58
+ .assign (chromosome = lambda x : "chr" + x ["chromosome" ])
59
+ )
60
+
61
+ gene_ids_adata = (adata .var_names if adata_gene_id is None else adata .var [adata_gene_id ]).values
62
+ missing_from_biomart = len (set (gene_ids_adata ) - set (biomart_annot [biomart_gene_id ].values ))
63
+ if missing_from_biomart :
64
+ logging .warning (
65
+ f"Biomart misses annotation for { missing_from_biomart } genes in adata. Did you use ENSEMBL ids?"
66
+ )
67
+
68
+ duplicated_symbols = np .sum (biomart_annot [biomart_gene_id ].duplicated ())
69
+ if duplicated_symbols :
70
+ logging .warning (f"Skipped { duplicated_symbols } genes because of duplicate identifiers in GTF file." )
71
+ biomart_annot = biomart_annot .loc [~ biomart_annot [biomart_gene_id ].duplicated (keep = False ), :]
72
+
73
+ tmp_var = adata .var .copy ()
74
+ orig_index_name = tmp_var .index .name
75
+ TMP_INDEX_NAME = "adata_var_index"
76
+ tmp_var .index .name = TMP_INDEX_NAME
77
+ tmp_var .reset_index (inplace = True )
78
+ var_annotated = tmp_var .merge (
79
+ biomart_annot ,
80
+ how = "left" ,
81
+ left_on = TMP_INDEX_NAME if adata_gene_id is None else adata_gene_id ,
82
+ right_on = biomart_gene_id ,
83
+ validate = "one_to_one" ,
84
+ )
85
+ var_annotated .set_index (TMP_INDEX_NAME , inplace = True )
86
+ var_annotated .index .name = orig_index_name
87
+
88
+ if inplace :
89
+ adata .var = var_annotated
90
+ else :
91
+ return var_annotated
92
+
93
+
11
94
def genomic_position_from_gtf (
12
95
gtf_file : Path | str ,
13
96
adata : AnnData | None = None ,
@@ -16,7 +99,8 @@ def genomic_position_from_gtf(
16
99
adata_gene_id : str | None = None ,
17
100
inplace : bool = True ,
18
101
) -> pd .DataFrame | None :
19
- """Get genomic gene positions from a GTF file.
102
+ """
103
+ Get genomic gene positions from a GTF file.
20
104
21
105
The GTF file needs to match the genome annotation used for your single cell dataset.
22
106
@@ -38,6 +122,12 @@ def genomic_position_from_gtf(
38
122
inplace
39
123
If True, add the annotations directly to adata, otherwise return a dataframe.
40
124
"""
125
+ try :
126
+ import gtfparse
127
+ except ImportError :
128
+ raise ImportError (
129
+ "genomic_position_from_gtf requires gtfparse as optional dependency. Please install it using `pip install gtfparse`."
130
+ ) from None
41
131
gtf = gtfparse .read_gtf (
42
132
gtf_file , usecols = ["seqname" , "feature" , "start" , "end" , "gene_id" , "gene_name" ]
43
133
).to_pandas ()
@@ -49,6 +139,8 @@ def genomic_position_from_gtf(
49
139
.drop_duplicates ()
50
140
.rename (columns = {"seqname" : "chromosome" })
51
141
)
142
+ # remove ensembl versions
143
+ gtf ["gene_id" ] = gtf ["gene_id" ].str .replace (r"\.\d+$" , "" , regex = True )
52
144
53
145
gene_ids_adata = (adata .var_names if adata_gene_id is None else adata .var [adata_gene_id ]).values
54
146
gtf = gtf .loc [gtf [gtf_gene_id ].isin (gene_ids_adata ), :]
@@ -77,6 +169,10 @@ def genomic_position_from_gtf(
77
169
var_annotated .set_index (TMP_INDEX_NAME , inplace = True )
78
170
var_annotated .index .name = orig_index_name
79
171
172
+ # if not a gencode GTF, let's add 'chr' prefix:
173
+ if np .all (~ var_annotated ["chromosome" ].dropna ().str .startswith ("chr" )):
174
+ var_annotated ["chromosome" ] = "chr" + var_annotated ["chromosome" ]
175
+
80
176
if inplace :
81
177
adata .var = var_annotated
82
178
else :
0 commit comments