Skip to content

Renaming of feature_key column of points #996

@LouisK92

Description

@LouisK92

I was trying to rename columns of a spatialdata points object and ran into issues regarding the column that is labeled as feature_key.

Here are some reproducible examples of what I tried:

Sdata setup

import pandas as pd
import dask.dataframe as dd
import spatialdata as sd

df = pd.DataFrame({"gene": ["A", "B", "C"], "x": [1, 2, 3], "y": [1, 1, 1]})
df = dd.from_pandas(df, npartitions=1)

sdata = sd.SpatialData(points={"points": sd.models.PointsModel.parse(df, feature_key="gene")})

Example 1

sdata['points'] = sdata['points'].rename(columns={"gene": "gene_symbol"})

leads to

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/miniconda3/envs/g3/lib/python3.11/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3811 try:
-> 3812     return self._engine.get_loc(casted_key)
   3813 except KeyError as err:

File pandas/_libs/index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7096, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'gene'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[60], line 9
      6 df = dd.from_pandas(df, npartitions=1)
      8 sdata = sd.SpatialData(points={"points": sd.models.PointsModel.parse(df, feature_key="gene")})
----> 9 sdata['points'] = sdata['points'].rename(columns={"gene": "gene_symbol"})
     12 sdata['points']['gene_symbol'] = sdata['points']['gene']
     13 del sdata['points']['gene']

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/_core/spatialdata.py:2393, in SpatialData.__setitem__(self, key, value)
   2382 def __setitem__(self, key: str, value: SpatialElement | AnnData) -> None:
   2383     """
   2384     Add the element to the SpatialData object.
   2385 
   (...)
   2391         The element.
   2392     """
-> 2393     schema = get_model(value)
   2394     if schema in (Image2DModel, Image3DModel):
   2395         self.images[key] = value

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/models/models.py:1193, in get_model(e)
   1191     return _validate_and_return(ShapesModel, e)
   1192 if isinstance(e, DaskDataFrame):
-> 1193     return _validate_and_return(PointsModel, e)
   1194 if isinstance(e, AnnData):
   1195     return _validate_and_return(TableModel, e)

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/models/models.py:1178, in get_model.<locals>._validate_and_return(schema, e)
   1174 def _validate_and_return(
   1175     schema: Schema_t,
   1176     e: SpatialElement,
   1177 ) -> Schema_t:
-> 1178     schema().validate(e)
   1179     return schema

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/models/models.py:653, in PointsModel.validate(cls, data)
    651 if ATTRS_KEY in data.attrs and "feature_key" in data.attrs[ATTRS_KEY]:
    652     feature_key = data.attrs[ATTRS_KEY][cls.FEATURE_KEY]
--> 653     if not isinstance(data[feature_key].dtype, CategoricalDtype):
    654         logger.info(f"Feature key `{feature_key}`could be of type `pd.Categorical`. Consider casting it.")

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/dask/dataframe/core.py:4955, in DataFrame.__getitem__(self, key)
   4952         return self.loc[key]
   4954 # error is raised from pandas
-> 4955 meta = self._meta[_extract_meta(key)]
   4956 dsk = partitionwise_graph(operator.getitem, name, self, key)
...
   3822     #  InvalidIndexError. Otherwise we fall through and re-raise
   3823     #  the TypeError.
   3824     self._check_indexing_error(key)

KeyError: 'gene'

Example 2 (error occurs after writing and reading)

sdata['points']['gene_symbol'] = sdata['points']['gene']
del sdata['points']['gene']

sdata.write("sdata_test.zarr")
sd.read_zarr("sdata_test.zarr")

leads to

---------------------------------------------------------------------------
ValidationError                           Traceback (most recent call last)
Cell In[59], line 17
     14 #sdata['points'].attrs["spatialdata_attrs"]["feature_key"] = "gene_symbol"
     16 sdata.write("sdata_test.zarr")
---> 17 sd.read_zarr("sdata_test.zarr")
     20 #sdata = sd.SpatialData(
     21 #    transcripts=sdata['transcripts'],
     22 #    counts=sdata['counts'],
   (...)
     25 #)
     26 #sdata

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/_io/io_zarr.py:229, in read_zarr(store, selection, on_bad_files)
    226 else:
    227     attrs = None
--> 229 sdata = SpatialData(
    230     images=images,
    231     labels=labels,
    232     points=points,
    233     shapes=shapes,
    234     tables=tables,
    235     attrs=attrs,
    236 )
    237 sdata.path = Path(store)
    238 return sdata

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/_utils.py:270, in _deprecation_alias.<locals>.deprecation_decorator.<locals>.wrapper(*args, **kwargs)
    268     raise ValueError("version for deprecation must be specified")
    269 rename_kwargs(f.__name__, kwargs, alias_copy, class_name, library, version)
--> 270 return f(*args, **kwargs)

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/_core/spatialdata.py:156, in SpatialData.__init__(self, images, labels, points, shapes, tables, attrs)
    151     duplicates = {x for x in element_names if element_names.count(x) > 1}
    152     raise KeyError(
    153         f"Element names must be unique. The following element names are used multiple times: {duplicates}"
    154     )
--> 156 with raise_validation_errors(
    157     title="Cannot construct SpatialData object, input contains invalid elements.\n"
    158     "For renaming, please see the discussion here https://github.com/scverse/spatialdata/discussions/707 .",
    159     exc_type=(ValueError, KeyError),
    160 ) as collect_error:
    161     if images is not None:
    162         for k, v in images.items():

File ~/miniconda3/envs/g3/lib/python3.11/site-packages/spatialdata/_core/validation.py:382, in raise_validation_errors.__exit__(self, exc_type, exc_val, exc_tb)
    380 # Exceptions were collected that we want to raise as a combined validation error.
    381 if self._collector.errors:
--> 382     raise ValidationError(title=self._message, errors=self._collector.errors)
    383 return True

ValidationError: Cannot construct SpatialData object, input contains invalid elements.
For renaming, please see the discussion here https://github.com/scverse/spatialdata/discussions/707 .
  points/points: gene

Solution

I got it working with a small adjustment to Example 2, namely manually updating the feature_key attribute after replacing the column:

sdata['points']['gene_symbol'] = sdata['points']['gene']
del sdata['points']['gene']
sdata['points'].attrs["spatialdata_attrs"]["feature_key"] = "gene_symbol"

sdata.write("sdata_test.zarr")
sd.read_zarr("sdata_test.zarr")

Expected behaviour

I think the expected behaviour would be that

  • sdata['points'] = sdata['points'].rename(columns={"gene": "gene_symbol"}) works out of the box, renaming the feature_key (sdata['points'].attrs["spatialdata_attrs"]["feature_key"]) as well
  • del sdata['points']['gene'] would delete the column and also remove the corresponding entry in sdata['points'].attrs["spatialdata_attrs"]["feature_key"] (maybe with a warning stating that the deleted column is the feature_key column and not just a standard column)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions