Skip to content

Commit 3cf6391

Browse files
authored
fix: allow read_10x_mtx to read numeric gene IDs (#3932)
1 parent b103908 commit 3cf6391

6 files changed

Lines changed: 58 additions & 9 deletions

File tree

docs/release-notes/3932.fix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Allow {func}`scanpy.read_10x_mtx` to read numeric gene IDs {smaller}`P Angerer`

src/scanpy/readwrite.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -628,21 +628,21 @@ def _read_10x_mtx(
628628
sep="\t",
629629
)
630630
if var_names == "gene_symbols":
631-
var_names_idx = pd.Index(genes[1].values)
631+
var_names_idx = pd.Index(genes[1].array)
632632
if make_unique:
633633
var_names_idx = anndata.utils.make_index_unique(var_names_idx)
634-
adata.var_names = var_names_idx
635-
adata.var["gene_ids"] = genes[0].values
634+
adata.var_names = var_names_idx.astype("str")
635+
adata.var["gene_ids"] = genes[0].array
636636
elif var_names == "gene_ids":
637-
adata.var_names = genes[0].values
638-
adata.var["gene_symbols"] = genes[1].values
637+
adata.var_names = genes[0].array.astype("str")
638+
adata.var["gene_symbols"] = genes[1].array
639639
else:
640640
msg = "`var_names` needs to be 'gene_symbols' or 'gene_ids'"
641641
raise ValueError(msg)
642642
if not is_legacy:
643-
adata.var["feature_types"] = genes[2].values
643+
adata.var["feature_types"] = genes[2].array
644644
barcodes = pd.read_csv(path / f"{prefix}barcodes.tsv{suffix}", header=None)
645-
adata.obs_names = barcodes[0].values
645+
adata.obs_names = barcodes[0].array
646646
return adata
647647

648648

tests/_data/10x_data/int-ids/barcodes.tsv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
AAACCCAAGATTAGCA-1
2+
AAACCCACACAATGAA-1
3+
AAACCCATCGGACCAC-1
4+
AAACCCATCTCGTTTA-1
5+
AAACGAAAGCAAGCCA-1

tests/_data/10x_data/int-ids/features.tsv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
574405 Pwwp4b Gene Expression
2+
574404 Pwwp4a Gene Expression
3+
574403 Insyn2b Gene Expression
4+
574402 Gpr17 Gene Expression
5+
100862323 Btbd35f22 Gene Expression
6+
100862329 Btbd35f21 Gene Expression

tests/_data/10x_data/int-ids/matrix.mtx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
%%MatrixMarket matrix coordinate real general
2+
%
3+
6 5 3
4+
4 3 6
5+
5 3 1
6+
6 3 1

tests/test_read_10x.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@
22

33
import shutil
44
from pathlib import Path
5+
from typing import TYPE_CHECKING
56
from unittest.mock import patch
67

78
import h5py
89
import numpy as np
10+
import pandas as pd
911
import pytest
1012

1113
import scanpy as sc
1214

15+
if TYPE_CHECKING:
16+
from typing import Literal
17+
18+
1319
ROOT = Path(__file__).parent
1420
ROOT = ROOT / "_data" / "10x_data"
1521
VISIUM_ROOT = Path(__file__).parent / "_data" / "visium_data"
@@ -28,15 +34,19 @@ def assert_anndata_equal(a1, a2):
2834
pytest.param(
2935
ROOT / "1.2.0" / "filtered_gene_bc_matrices" / "hg19_chr21",
3036
ROOT / "1.2.0" / "filtered_gene_bc_matrices_h5.h5",
37+
id="1.2.0",
3138
),
3239
pytest.param(
3340
ROOT / "3.0.0" / "filtered_feature_bc_matrix",
3441
ROOT / "3.0.0" / "filtered_feature_bc_matrix.h5",
42+
id="3.0.0",
3543
),
3644
],
3745
)
38-
@pytest.mark.parametrize("prefix", [None, "prefix_"])
39-
def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
46+
@pytest.mark.parametrize("prefix", [None, "prefix_"], ids=["no_prefix", "prefix"])
47+
def test_read_10x(
48+
tmp_path: Path, mtx_path: Path, h5_path: Path, prefix: str | None
49+
) -> None:
4050
if prefix is not None:
4151
# Build files named "prefix_XXX.xxx" in a temporary directory.
4252
mtx_path_orig = mtx_path
@@ -66,6 +76,27 @@ def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
6676
assert_anndata_equal(sc.read_h5ad(from_mtx_pth), sc.read_h5ad(from_h5_pth))
6777

6878

79+
@pytest.mark.parametrize(
80+
("genes", "col_dtypes"),
81+
[
82+
pytest.param("symbols", dict(gene_ids="int64"), id="symbols"),
83+
pytest.param("ids", dict(gene_symbols="str"), id="ids"),
84+
],
85+
)
86+
def test_read_10x_mtx_int(
87+
genes: Literal["symbols", "ids"], col_dtypes: dict[str, str]
88+
) -> None:
89+
str_dt = "str" if pd.options.future.infer_string else "object"
90+
col_dtypes = {k: str_dt if v == "str" else v for k, v in col_dtypes.items()}
91+
92+
adata = sc.read_10x_mtx(
93+
ROOT / "int-ids", var_names=f"gene_{genes}", compressed=False
94+
)
95+
96+
assert adata.var.index.dtype == str_dt
97+
assert dict(adata.var.dtypes) == dict(feature_types=str_dt, **col_dtypes)
98+
99+
69100
def test_read_10x_h5_v1():
70101
spec_genome_v1 = sc.read_10x_h5(
71102
ROOT / "1.2.0" / "filtered_gene_bc_matrices_h5.h5",

0 commit comments

Comments
 (0)