     CENSUS_OBS_STATS_COLUMNS,
     CENSUS_OBS_TABLE_SPEC,
     CENSUS_SCHEMA_VERSION,
+    CENSUS_SPATIAL_NAME,
     CENSUS_SUMMARY_CELL_COUNTS_NAME,
     CENSUS_SUMMARY_CELL_COUNTS_TABLE_SPEC,
     CENSUS_SUMMARY_NAME,
@@ -82,9 +83,18 @@ def assert_all(__iterable: Iterable[object]) -> bool:
     return r


+def get_census_data_collection_name(eb: ExperimentSpecification) -> str:
+    return CENSUS_SPATIAL_NAME if eb.is_exclusively_spatial() else CENSUS_DATA_NAME
+
+
+def get_experiment_uri(base_uri: str, eb: ExperimentSpecification) -> str:
+    census_data_collection_name = get_census_data_collection_name(eb)
+    return urlcat(base_uri, census_data_collection_name, eb.name)
+
+
 def open_experiment(base_uri: str, eb: ExperimentSpecification) -> soma.Experiment:
     """Helper function that knows the Census schema path conventions."""
-    return soma.Experiment.open(urlcat(base_uri, CENSUS_DATA_NAME, eb.name), mode="r")
+    return soma.Experiment.open(get_experiment_uri(base_uri, eb), mode="r")


 def get_experiment_shape(base_uri: str, specs: list[ExperimentSpecification]) -> dict[str, tuple[int, int]]:
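A self-contained sketch of what the two new helpers do may help here; the constant values, the `urlcat` stand-in, and the `ExperimentSpecification` stub below are illustrative assumptions, not the builder's actual definitions:

```python
# Sketch of the routing helpers added above. Constant values and the
# ExperimentSpecification stub are assumptions for illustration only.
from dataclasses import dataclass

CENSUS_DATA_NAME = "census_data"        # assumed value
CENSUS_SPATIAL_NAME = "census_spatial"  # assumed value


def urlcat(*parts: str) -> str:
    """Stand-in for the builder's urlcat helper: join URI segments."""
    return "/".join(p.rstrip("/") for p in parts)


@dataclass
class ExperimentSpecification:
    name: str
    exclusively_spatial: bool

    def is_exclusively_spatial(self) -> bool:
        return self.exclusively_spatial


def get_census_data_collection_name(eb: ExperimentSpecification) -> str:
    # Spatial-only experiments live under the spatial root collection.
    return CENSUS_SPATIAL_NAME if eb.is_exclusively_spatial() else CENSUS_DATA_NAME


def get_experiment_uri(base_uri: str, eb: ExperimentSpecification) -> str:
    return urlcat(base_uri, get_census_data_collection_name(eb), eb.name)


print(get_experiment_uri("s3://bucket/census", ExperimentSpecification("homo_sapiens", False)))
# s3://bucket/census/census_data/homo_sapiens
print(get_experiment_uri("s3://bucket/census", ExperimentSpecification("homo_sapiens", True)))
# s3://bucket/census/census_spatial/homo_sapiens
```

With this in place, `open_experiment` no longer hard-codes `CENSUS_DATA_NAME` and works for both root collections.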
@@ -230,9 +240,10 @@ def validate_axis_dataframes_global_ids(
                 .concat()
                 .to_pandas()
             )
-            assert eb_info[eb.name].n_obs == len(census_obs_df) == exp.obs.count
-            assert (len(census_obs_df) == 0) or (census_obs_df.soma_joinid.max() + 1 == eb_info[eb.name].n_obs)
-            assert eb_info[eb.name].dataset_ids == set(census_obs_df.dataset_id.unique())
+            eb_info_key = get_experiment_uri(soma_path, eb)
+            assert eb_info[eb_info_key].n_obs == len(census_obs_df) == exp.obs.count
+            assert (len(census_obs_df) == 0) or (census_obs_df.soma_joinid.max() + 1 == eb_info[eb_info_key].n_obs)
+            assert eb_info[eb_info_key].dataset_ids == set(census_obs_df.dataset_id.unique())

             # Validate that all obs soma_joinids are unique and in the range [0, n).
             obs_unique_joinids = np.unique(census_obs_df.soma_joinid.to_numpy())
@@ -254,13 +265,13 @@ def validate_axis_dataframes_global_ids(
             del census_obs_df, obs_unique_joinids

             # var
-            n_vars = len(eb_info[eb.name].vars)
+            n_vars = len(eb_info[eb_info_key].vars)

             census_var_df = (
                 exp.ms[MEASUREMENT_RNA_NAME].var.read(column_names=["feature_id", "soma_joinid"]).concat().to_pandas()
             )
             assert n_vars == len(census_var_df) == exp.ms[MEASUREMENT_RNA_NAME].var.count
-            assert eb_info[eb.name].vars == set(census_var_df.feature_id.array)
+            assert eb_info[eb_info_key].vars == set(census_var_df.feature_id.array)
             assert (len(census_var_df) == 0) or (census_var_df.soma_joinid.max() + 1 == n_vars)

             # Validate that all var soma_joinids are unique and in the range [0, n).
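The comment above names the joinid invariant the validator enforces for both axes: soma_joinids must be unique and densely cover [0, n). A standalone sketch of that check with numpy, on toy data rather than the builder's dataframes:

```python
import numpy as np

# Toy data standing in for census_var_df.soma_joinid.to_numpy().
joinids = np.array([3, 0, 1, 2], dtype=np.int64)

unique_joinids = np.unique(joinids)  # sorted, deduplicated
n = len(joinids)
assert len(unique_joinids) == n  # no duplicates
# Sorted + unique + first == 0 + last == n - 1 implies exactly [0, n).
assert n == 0 or (unique_joinids[0] == 0 and unique_joinids[-1] == n - 1)
```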
@@ -289,7 +300,7 @@ def _validate_axis_dataframes(
     eb_info: dict[str, EbInfo] = {}
     for eb in experiment_specifications:
         with soma.Collection.open(soma_path, context=SOMA_TileDB_Context()) as census:
-            census_data = census[CENSUS_DATA_NAME]
+            census_data_collection = census[get_census_data_collection_name(eb)]
             dataset_id = dataset.dataset_id
             ad = open_anndata(
                 dataset,
@@ -298,8 +309,16 @@ def _validate_axis_dataframes(
                 var_column_names=CXG_VAR_COLUMNS_READ,
                 filter_spec=eb.anndata_cell_filter_spec,
             )
-            eb_info[eb.name] = EbInfo()
-            se = census_data[eb.name]
+            se = census_data_collection[eb.name]
+
+            # NOTE: Since we validate data per experiment, use the experiment URI
+            # as the key for the data to be validated. Keying on the experiment
+            # spec name alone would collide when a spatial and a non-spatial
+            # experiment share the same name but are stored under different
+            # census root collections.
+            eb_info_key = get_experiment_uri(soma_path, eb)
+            eb_info[eb_info_key] = EbInfo()
+
             dataset_obs = (
                 se.obs.read(
                     column_names=list(CENSUS_OBS_TABLE_SPEC.field_names()),
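The NOTE in the hunk above is the crux of the keying change. A self-contained illustration of the collision it avoids (the collection names are assumptions, mirroring the sketch earlier):

```python
# Illustrative only: two experiment specs can share a name while living
# under different root collections, so name-only keys would collide.
specs = [
    ("homo_sapiens", "census_data"),     # non-spatial (assumed collection name)
    ("homo_sapiens", "census_spatial"),  # spatial (assumed collection name)
]
base_uri = "s3://bucket/census"

name_keys = {name for name, _ in specs}
uri_keys = {f"{base_uri}/{collection}/{name}" for name, collection in specs}

assert len(name_keys) == 1  # collision: one key would cover two experiments
assert len(uri_keys) == 2   # URI-based keys remain distinct
```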
@@ -326,11 +345,13 @@ def _validate_axis_dataframes(
                 if isinstance(dataset_obs[key].dtype, pd.CategoricalDtype):
                     dataset_obs[key] = dataset_obs[key].astype(dataset_obs[key].cat.categories.dtype)

-            assert len(dataset_obs) == len(ad.obs), f"{dataset.dataset_id}/{eb.name} obs length mismatch"
+            assert (
+                len(dataset_obs) == len(ad.obs)
+            ), f"{dataset.dataset_id}/{eb.name} obs length mismatch: soma experiment obs len {len(dataset_obs)} != anndata obs len {len(ad.obs)}"
             if ad.n_obs > 0:
-                eb_info[eb.name].n_obs += ad.n_obs
-                eb_info[eb.name].dataset_ids.add(dataset_id)
-                eb_info[eb.name].vars |= set(ad.var.index.array)
+                eb_info[eb_info_key].n_obs += ad.n_obs
+                eb_info[eb_info_key].dataset_ids.add(dataset_id)
+                eb_info[eb_info_key].vars |= set(ad.var.index.array)
             ad_obs = ad.obs[list(set(CXG_OBS_TERM_COLUMNS) - set(CENSUS_OBS_STATS_COLUMNS))].reset_index(
                 drop=True
             )
@@ -343,11 +364,11 @@ def _validate_axis_dataframes(
     def reduce_eb_info(results: Sequence[dict[str, EbInfo]]) -> dict[str, EbInfo]:
         eb_info = {}
         for res in results:
-            for name, info in res.items():
-                if name not in eb_info:
-                    eb_info[name] = copy.copy(info)
+            for eb_info_key, info in res.items():
+                if eb_info_key not in eb_info:
+                    eb_info[eb_info_key] = copy.copy(info)
                 else:
-                    eb_info[name].update(info)
+                    eb_info[eb_info_key].update(info)
         return eb_info

     eb_info = (
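Since the merge key changed from spec name to experiment URI, a minimal model of the reduce step may be useful. The `EbInfo` shape below is an assumption, inferred from the fields this diff touches (`n_obs`, `dataset_ids`, `vars`):

```python
import copy
from collections.abc import Sequence
from dataclasses import dataclass, field


@dataclass
class EbInfo:
    """Assumed shape, inferred from the fields used elsewhere in this diff."""

    n_obs: int = 0
    dataset_ids: set[str] = field(default_factory=set)
    vars: set[str] = field(default_factory=set)

    def update(self, other: "EbInfo") -> None:
        self.n_obs += other.n_obs
        self.dataset_ids |= other.dataset_ids
        self.vars |= other.vars


def reduce_eb_info(results: Sequence[dict[str, EbInfo]]) -> dict[str, EbInfo]:
    # Merge per-worker partial results, keyed by experiment URI.
    # Note: copy.copy is shallow, mirroring the diff; the first result's
    # sets are mutated in place by later updates.
    eb_info: dict[str, EbInfo] = {}
    for res in results:
        for eb_info_key, info in res.items():
            if eb_info_key not in eb_info:
                eb_info[eb_info_key] = copy.copy(info)
            else:
                eb_info[eb_info_key].update(info)
    return eb_info


key = "s3://bucket/census/census_data/homo_sapiens"
merged = reduce_eb_info([
    {key: EbInfo(10, {"d1"}, {"ENSG01"})},
    {key: EbInfo(5, {"d2"}, {"ENSG02"})},
])
assert merged[key].n_obs == 15
assert merged[key].dataset_ids == {"d1", "d2"}
```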
@@ -815,8 +836,9 @@ def validate_X_layers_schema(
     with open_experiment(soma_path, eb) as exp:
         assert soma.Collection.exists(exp.ms[MEASUREMENT_RNA_NAME].X.uri)

-        n_obs = eb_info[eb.name].n_obs
-        n_vars = eb_info[eb.name].n_vars
+        eb_info_key = get_experiment_uri(soma_path, eb)
+        n_obs = eb_info[eb_info_key].n_obs
+        n_vars = eb_info[eb_info_key].n_vars
         assert n_obs == exp.obs.count
         assert n_vars == exp.ms[MEASUREMENT_RNA_NAME].var.count

@@ -1011,8 +1033,9 @@ def get_sparse_arrays(C: soma.Collection) -> list[soma.SparseNDArray]:
     # first, confirm we set shape correctly, as the code uses it as the max bounding box
     for eb in experiment_specifications:
         with open_experiment(soma_path, eb) as exp:
-            n_obs = eb_info[eb.name].n_obs
-            n_vars = eb_info[eb.name].n_vars
+            eb_info_key = get_experiment_uri(soma_path, eb)
+            n_obs = eb_info[eb_info_key].n_obs
+            n_vars = eb_info[eb_info_key].n_vars
             for layer_name in exp.ms[MEASUREMENT_RNA_NAME].X:
                 assert exp.ms[MEASUREMENT_RNA_NAME].X[layer_name].shape == (n_obs, n_vars)
             if "feature_dataset_presence_matrix" in exp.ms[MEASUREMENT_RNA_NAME]: