anndataR/inst/scripts/example_h5ad.py at 8a7c541a5e0e3275222f01d8e5048c2022d3b4d6 · scverse/anndataR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# python v3.13.5
import anndata  # anndata v0.11.4
import scanpy  # scanpy v1.11.4
import numpy  # numpy v2.2.6
import pandas  # pandas v2.3.0
import scipy.sparse  # scipy v1.14.1

# This script uses Python to create an example H5AD file for testing
# interoperability between languages. It is designed to be a small but
# relatively complex file that tests reading of different types and data
# structures. The standard scanpy workflow has also been applied to populate
# some of the most common information from real analyses. It should be updated
# to test new issues as they are discovered.
#
# NOTE: When updating this script for the {anndataR} example H5AD file please
# update the package versions used above, update the script version, date and
# changelog below and format the file using Python Black
# (https://black.readthedocs.io/en/stable/).
#
# Version: 0.2.0
# Date: 2023-05-11
#
# CHANGELOG
#
# v0.3.0 (2025-08-04)
# - Add adata.varp["test_varp"] to test reading of varp
# - Update package versions to latest stable versions
# v0.2.0 (2023-05-11)
# - Add 1D sparse matrix to `adata.uns["Sparse1D"]
# - Reduce the size of `adata.uns["String2D"]` and add columns to values
# v0.1.1 (2023-05-09)
# - Reduce the size of `adata.uns["String2D"]` to save space
# - Reduce dimension to 50 x 100 to save space
# v0.1.0 (2023-05-08)
# - Initial version

numpy.random.seed(0)

# Randomly generate a counts matrix
counts = numpy.random.poisson(2, size=(50, 100))

# Create an AnnData
adata = anndata.AnnData(scipy.sparse.csr_matrix(counts.copy(), dtype=numpy.float32))
adata.obs_names = [f"Cell{i:03d}" for i in range(adata.n_obs)]
adata.var_names = [f"Gene{i:03d}" for i in range(adata.n_vars)]

# Populate layers with different matrix types
adata.layers["counts"] = adata.X.copy()
adata.layers["dense_counts"] = counts.copy()
adata.layers["csc_counts"] = scipy.sparse.csc_matrix(counts.copy(), dtype=numpy.float32)

# Populate adata.var with different types
adata.var["String"] = [f"String{i}" for i in range(adata.n_vars)]

# Populate adata.obs with different types
adata.obs["Float"] = 42.42
adata.obs["FloatNA"] = adata.obs["Float"]
adata.obs["FloatNA"][0] = float("nan")
adata.obs["Int"] = numpy.arange(adata.n_obs)
adata.obs["IntNA"] = pandas.array([None] + [42] * (adata.n_obs - 1))
adata.obs["Bool"] = pandas.array([False] + [True] * (adata.n_obs - 1))
adata.obs["BoolNA"] = pandas.array([False, None] + [True] * (adata.n_obs - 2))

# Populate adata.uns with different types
adata.uns["Category"] = pandas.array(["a", "b", None], dtype="category")
adata.uns["Bool"] = [True, True, False]
adata.uns["BoolNA"] = pandas.array([True, False, None])
adata.uns["Int"] = [1, 2, 3]
adata.uns["IntNA"] = pandas.array([1, 2, None])
adata.uns["IntScalar"] = 1
adata.uns["Sparse1D"] = scipy.sparse.csc_matrix([1, 2, 0, 0, 0, 3])
adata.uns["StringScalar"] = "A string"
adata.uns["String"] = [f"String {i}" for i in range(10)]
adata.uns["String2D"] = [[f"row{i}col{j}" for i in range(10)] for j in range(5)]
adata.uns["DataFrameEmpty"] = pandas.DataFrame(index=adata.obs.index)

# Run the standard scanpy workflow
scanpy.pp.calculate_qc_metrics(adata, percent_top=None, inplace=True)
scanpy.pp.normalize_total(adata, inplace=True)
adata.layers["dense_X"] = adata.X.copy().toarray()
scanpy.pp.log1p(adata)
scanpy.pp.highly_variable_genes(adata)
scanpy.tl.pca(adata)
scanpy.pp.neighbors(adata)
scanpy.tl.umap(adata)
scanpy.tl.leiden(adata)
scanpy.tl.rank_genes_groups(adata, "leiden")

# add varp to test reading of varp
adata.varp["test_varp"] = numpy.random.rand(adata.n_vars, adata.n_vars)

# Write the H5AD file
adata.write_h5ad("inst/extdata/example.h5ad", compression="gzip")