Skip to content

Commit 049f00d

Browse files
Add census_data and census_spatial collections
1 parent 28ac990 commit 049f00d

File tree

6 files changed

+984
-304
lines changed

6 files changed

+984
-304
lines changed

tools/cellxgene_census_builder/spatial_dev_tools/census_spatial_dataset_ingest.ipynb

+888-275
Large diffs are not rendered by default.

tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/build_soma.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from .globals import (
2626
CENSUS_DATA_NAME,
2727
CENSUS_INFO_NAME,
28+
CENSUS_SPATIAL_NAME,
2829
SOMA_TileDB_Context,
2930
)
3031
from .manifest import load_manifest
@@ -153,7 +154,7 @@ def populate_root_collection(root_collection: soma.Collection) -> soma.Collectio
153154
root_collection.metadata["git_commit_sha"] = sha
154155

155156
# Create sub-collections for experiments, etc.
156-
for n in [CENSUS_INFO_NAME, CENSUS_DATA_NAME]:
157+
for n in [CENSUS_INFO_NAME, CENSUS_DATA_NAME, CENSUS_SPATIAL_NAME]:
157158
root_collection.add_new_collection(n)
158159

159160
return root_collection
@@ -198,7 +199,14 @@ def build_step2_create_root_collection(soma_path: str, experiment_builders: list
198199
populate_root_collection(root_collection)
199200

200201
for e in experiment_builders:
201-
e.create(census_data=root_collection[CENSUS_DATA_NAME])
202+
# TODO (spatial): Confirm the decision that we are clearly separating
203+
# experiments containing spatial assays from experiments not containing
204+
# spatial assays. That is, an experiment should never contain assays from
205+
# both spatial and non-spatial modalities.
206+
if e.specification.is_exclusively_spatial():
207+
e.create(census_data=root_collection[CENSUS_SPATIAL_NAME])
208+
else:
209+
e.create(census_data=root_collection[CENSUS_DATA_NAME])
202210

203211
logger.info("Build step 2 - Create root collection - finished")
204212
return root_collection

tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from .anndata import AnnDataFilterSpec, AnnDataProxy, open_anndata
2626
from .datasets import Dataset
2727
from .globals import (
28+
ALLOWED_SPATIAL_ASSAYS,
2829
CENSUS_OBS_PLATFORM_CONFIG,
2930
CENSUS_OBS_TABLE_SPEC,
3031
CENSUS_VAR_PLATFORM_CONFIG,
@@ -109,6 +110,10 @@ def create(
109110
"""Factory method. Do not instantiate the class directly."""
110111
return cls(name, label, anndata_cell_filter_spec, organism_ontology_term_id)
111112

113+
def is_exclusively_spatial(self) -> bool:
114+
"""Returns True if the experiment specification EXCLUSIVELY involves spatial assays."""
115+
return self.anndata_cell_filter_spec["assay_ontology_term_ids"] == ALLOWED_SPATIAL_ASSAYS
116+
112117

113118
class ExperimentBuilder:
114119
"""Class that embodies the operators and state to build an Experiment.
@@ -143,7 +148,7 @@ def anndata_cell_filter_spec(self) -> AnnDataFilterSpec:
143148
return self.specification.anndata_cell_filter_spec
144149

145150
def create(self, census_data: soma.Collection) -> None:
146-
"""Create experiment within the specified Collection with a single Measurement."""
151+
"""Create experiment within the specified Collection."""
147152
logger.info(f"{self.name}: create experiment at {urlcat(census_data.uri, self.name)}")
148153

149154
self.experiment = census_data.add_new_collection(self.name, soma.Experiment)
@@ -155,6 +160,10 @@ def create(self, census_data: soma.Collection) -> None:
155160
# make measurement and add to ms collection
156161
ms.add_new_collection(MEASUREMENT_RNA_NAME, soma.Measurement)
157162

163+
# create the `spatial` collection (only for exclusively spatial experiments)
164+
if self.specification.is_exclusively_spatial():
165+
self.experiment.add_new_collection("spatial")
166+
158167
def write_obs_dataframe(self) -> None:
159168
logger.info(f"{self.name}: writing obs dataframe")
160169
assert self.experiment is not None
@@ -661,7 +670,7 @@ def read_and_dispatch_partial_h5ad(
661670
if d.dataset_id in eb.dataset_obs_joinid_start
662671
for chunk in range(0, eb.dataset_n_obs[d.dataset_id], REDUCE_X_MAJOR_ROW_STRIDE)
663672
]
664-
per_eb_results[eb.name] = (
673+
per_eb_results[eb.experiment_uri] = (
665674
dask.bag.from_sequence(read_file_chunks)
666675
.starmap(read_and_dispatch_partial_h5ad, global_var_joinids=global_var_joinids)
667676
.foldby("dataset_id", reduce_X_stats_binop)
@@ -685,12 +694,12 @@ def populate_X_layers(
685694
per_eb_results = _reduce_X_matrices(assets_path, datasets, experiment_builders)
686695

687696
for eb in experiment_builders:
688-
if eb.name not in per_eb_results:
697+
if eb.experiment_uri not in per_eb_results:
689698
continue
690699

691700
# add per-dataset stats to each per-dataset XReduction
692701
eb_result: list[XReduction] = []
693-
for dataset_id, xreduction in per_eb_results[eb.name]:
702+
for dataset_id, xreduction in per_eb_results[eb.experiment_uri]:
694703
assert dataset_id == xreduction["dataset_id"]
695704
d = datasets_by_id[dataset_id]
696705
eb_result.extend(

tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_specs.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import functools
22

33
from .experiment_builder import ExperimentBuilder, ExperimentSpecification
4-
from .globals import RNA_SEQ
4+
from .globals import ALLOWED_SPATIAL_ASSAYS, RNA_SEQ
55

66

77
@functools.cache
@@ -30,6 +30,25 @@ def make_experiment_specs() -> list[ExperimentSpecification]:
3030
},
3131
organism_ontology_term_id="NCBITaxon:10090",
3232
),
33+
# Experiments for spatial assays
34+
ExperimentSpecification.create(
35+
name="homo_sapiens",
36+
label="Homo sapiens",
37+
anndata_cell_filter_spec={
38+
"organism_ontology_term_id": "NCBITaxon:9606",
39+
"assay_ontology_term_ids": ALLOWED_SPATIAL_ASSAYS,
40+
},
41+
organism_ontology_term_id="NCBITaxon:9606",
42+
),
43+
ExperimentSpecification.create(
44+
name="mus_musculus",
45+
label="Mus musculus",
46+
anndata_cell_filter_spec={
47+
"organism_ontology_term_id": "NCBITaxon:10090",
48+
"assay_ontology_term_ids": ALLOWED_SPATIAL_ASSAYS,
49+
},
50+
organism_ontology_term_id="NCBITaxon:10090",
51+
),
3352
]
3453

3554

tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@
5252
# top-level SOMA collection
5353
CENSUS_DATA_NAME = "census_data"
5454

55+
# top-level SOMA collection holding the exclusively spatial experiments
56+
CENSUS_SPATIAL_NAME = "census_spatial"
57+
5558
# "census_info"/"summary_cell_counts" SOMA Dataframe
5659
CENSUS_SUMMARY_CELL_COUNTS_NAME = "summary_cell_counts" # object name
5760

@@ -329,7 +332,6 @@
329332
"EFO:0010713", # 10x immune profiling
330333
"EFO:0010714", # 10x TCR enrichment
331334
"EFO:0010715", # 10x Ig enrichment
332-
"EFO:0010961", # Visium Spatial Gene Expression
333335
"EFO:0010964", # barcoded plate-based single cell RNA-seq
334336
"EFO:0011025", # 10x 5' v1
335337
"EFO:0022396", # TruSeq
@@ -353,6 +355,12 @@
353355
"EFO:0700016", # Smart-seq v4
354356
]
355357

358+
# list of EFO terms that correspond to SPATIAL modality/measurement. These terms
359+
# define the inclusive filter applied to obs.assay_ontology_term_id. All other assay terms are excluded by this filter.
360+
ALLOWED_SPATIAL_ASSAYS = [
361+
"EFO:0010961", # Visium Spatial Gene Expression
362+
]
363+
356364
# Full-gene assays have special handling in the "normalized" X layers
357365
FULL_GENE_ASSAY = [
358366
"EFO:0003755", # FL-cDNA

tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/validate_soma.py

+44-21
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
CENSUS_OBS_STATS_COLUMNS,
3939
CENSUS_OBS_TABLE_SPEC,
4040
CENSUS_SCHEMA_VERSION,
41+
CENSUS_SPATIAL_NAME,
4142
CENSUS_SUMMARY_CELL_COUNTS_NAME,
4243
CENSUS_SUMMARY_CELL_COUNTS_TABLE_SPEC,
4344
CENSUS_SUMMARY_NAME,
@@ -82,9 +83,18 @@ def assert_all(__iterable: Iterable[object]) -> bool:
8283
return r
8384

8485

86+
def get_census_data_collection_name(eb: ExperimentSpecification) -> str:
87+
return CENSUS_SPATIAL_NAME if eb.is_exclusively_spatial() else CENSUS_DATA_NAME
88+
89+
90+
def get_experiment_uri(base_uri: str, eb: ExperimentSpecification) -> str:
91+
census_data_collection_name = get_census_data_collection_name(eb)
92+
return urlcat(base_uri, census_data_collection_name, eb.name)
93+
94+
8595
def open_experiment(base_uri: str, eb: ExperimentSpecification) -> soma.Experiment:
8696
"""Helper function that knows the Census schema path conventions."""
87-
return soma.Experiment.open(urlcat(base_uri, CENSUS_DATA_NAME, eb.name), mode="r")
97+
return soma.Experiment.open(get_experiment_uri(base_uri, eb), mode="r")
8898

8999

90100
def get_experiment_shape(base_uri: str, specs: list[ExperimentSpecification]) -> dict[str, tuple[int, int]]:
@@ -230,9 +240,10 @@ def validate_axis_dataframes_global_ids(
230240
.concat()
231241
.to_pandas()
232242
)
233-
assert eb_info[eb.name].n_obs == len(census_obs_df) == exp.obs.count
234-
assert (len(census_obs_df) == 0) or (census_obs_df.soma_joinid.max() + 1 == eb_info[eb.name].n_obs)
235-
assert eb_info[eb.name].dataset_ids == set(census_obs_df.dataset_id.unique())
243+
eb_info_key = get_experiment_uri(soma_path, eb)
244+
assert eb_info[eb_info_key].n_obs == len(census_obs_df) == exp.obs.count
245+
assert (len(census_obs_df) == 0) or (census_obs_df.soma_joinid.max() + 1 == eb_info[eb_info_key].n_obs)
246+
assert eb_info[eb_info_key].dataset_ids == set(census_obs_df.dataset_id.unique())
236247

237248
# Validate that all obs soma_joinids are unique and in the range [0, n).
238249
obs_unique_joinids = np.unique(census_obs_df.soma_joinid.to_numpy())
@@ -254,13 +265,13 @@ def validate_axis_dataframes_global_ids(
254265
del census_obs_df, obs_unique_joinids
255266

256267
# var
257-
n_vars = len(eb_info[eb.name].vars)
268+
n_vars = len(eb_info[eb_info_key].vars)
258269

259270
census_var_df = (
260271
exp.ms[MEASUREMENT_RNA_NAME].var.read(column_names=["feature_id", "soma_joinid"]).concat().to_pandas()
261272
)
262273
assert n_vars == len(census_var_df) == exp.ms[MEASUREMENT_RNA_NAME].var.count
263-
assert eb_info[eb.name].vars == set(census_var_df.feature_id.array)
274+
assert eb_info[eb_info_key].vars == set(census_var_df.feature_id.array)
264275
assert (len(census_var_df) == 0) or (census_var_df.soma_joinid.max() + 1 == n_vars)
265276

266277
# Validate that all var soma_joinids are unique and in the range [0, n).
@@ -289,7 +300,7 @@ def _validate_axis_dataframes(
289300
eb_info: dict[str, EbInfo] = {}
290301
for eb in experiment_specifications:
291302
with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census:
292-
census_data = census[CENSUS_DATA_NAME]
303+
census_data_collection = census[get_census_data_collection_name(eb)]
293304
dataset_id = dataset.dataset_id
294305
ad = open_anndata(
295306
dataset,
@@ -298,8 +309,16 @@ def _validate_axis_dataframes(
298309
var_column_names=CXG_VAR_COLUMNS_READ,
299310
filter_spec=eb.anndata_cell_filter_spec,
300311
)
301-
eb_info[eb.name] = EbInfo()
302-
se = census_data[eb.name]
312+
se = census_data_collection[eb.name]
313+
314+
# NOTE: Since we are validating data for each experiment, we
315+
# use the experiment URI as the key for the data that must be validated.
316+
# Using just the experiment spec name would cause collisions, e.g. when a
317+
# spatial and a non-spatial experiment share the same experiment spec name
318+
# but are stored under different census root collections.
319+
eb_info_key = get_experiment_uri(soma_path, eb)
320+
eb_info[eb_info_key] = EbInfo()
321+
303322
dataset_obs = (
304323
se.obs.read(
305324
column_names=list(CENSUS_OBS_TABLE_SPEC.field_names()),
@@ -326,11 +345,13 @@ def _validate_axis_dataframes(
326345
if isinstance(dataset_obs[key].dtype, pd.CategoricalDtype):
327346
dataset_obs[key] = dataset_obs[key].astype(dataset_obs[key].cat.categories.dtype)
328347

329-
assert len(dataset_obs) == len(ad.obs), f"{dataset.dataset_id}/{eb.name} obs length mismatch"
348+
assert (
349+
len(dataset_obs) == len(ad.obs)
350+
), f"{dataset.dataset_id}/{eb.name} obs length mismatch soma experiment obs len: {len(dataset_obs)} != anndata obs len: {len(ad.obs)}"
330351
if ad.n_obs > 0:
331-
eb_info[eb.name].n_obs += ad.n_obs
332-
eb_info[eb.name].dataset_ids.add(dataset_id)
333-
eb_info[eb.name].vars |= set(ad.var.index.array)
352+
eb_info[eb_info_key].n_obs += ad.n_obs
353+
eb_info[eb_info_key].dataset_ids.add(dataset_id)
354+
eb_info[eb_info_key].vars |= set(ad.var.index.array)
334355
ad_obs = ad.obs[list(set(CXG_OBS_TERM_COLUMNS) - set(CENSUS_OBS_STATS_COLUMNS))].reset_index(
335356
drop=True
336357
)
@@ -343,11 +364,11 @@ def _validate_axis_dataframes(
343364
def reduce_eb_info(results: Sequence[dict[str, EbInfo]]) -> dict[str, EbInfo]:
344365
eb_info = {}
345366
for res in results:
346-
for name, info in res.items():
347-
if name not in eb_info:
348-
eb_info[name] = copy.copy(info)
367+
for eb_info_key, info in res.items():
368+
if eb_info_key not in eb_info:
369+
eb_info[eb_info_key] = copy.copy(info)
349370
else:
350-
eb_info[name].update(info)
371+
eb_info[eb_info_key].update(info)
351372
return eb_info
352373

353374
eb_info = (
@@ -815,8 +836,9 @@ def validate_X_layers_schema(
815836
with open_experiment(soma_path, eb) as exp:
816837
assert soma.Collection.exists(exp.ms[MEASUREMENT_RNA_NAME].X.uri)
817838

818-
n_obs = eb_info[eb.name].n_obs
819-
n_vars = eb_info[eb.name].n_vars
839+
eb_info_key = get_experiment_uri(soma_path, eb)
840+
n_obs = eb_info[eb_info_key].n_obs
841+
n_vars = eb_info[eb_info_key].n_vars
820842
assert n_obs == exp.obs.count
821843
assert n_vars == exp.ms[MEASUREMENT_RNA_NAME].var.count
822844

@@ -1011,8 +1033,9 @@ def get_sparse_arrays(C: soma.Collection) -> list[soma.SparseNDArray]:
10111033
# first, confirm we set shape correctly, as the code uses it as the max bounding box
10121034
for eb in experiment_specifications:
10131035
with open_experiment(soma_path, eb) as exp:
1014-
n_obs = eb_info[eb.name].n_obs
1015-
n_vars = eb_info[eb.name].n_vars
1036+
eb_info_key = get_experiment_uri(soma_path, eb)
1037+
n_obs = eb_info[eb_info_key].n_obs
1038+
n_vars = eb_info[eb_info_key].n_vars
10161039
for layer_name in exp.ms[MEASUREMENT_RNA_NAME].X:
10171040
assert exp.ms[MEASUREMENT_RNA_NAME].X[layer_name].shape == (n_obs, n_vars)
10181041
if "feature_dataset_presence_matrix" in exp.ms[MEASUREMENT_RNA_NAME]:

0 commit comments

Comments
 (0)