Skip to content

Commit b446e81

Browse files
feat(breadbox): Support list of strings values matrix (#217)
1 parent ac08320 commit b446e81

File tree

7 files changed

+273
-32
lines changed

7 files changed

+273
-32
lines changed
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""Add list_strings as matrix value type
2+
3+
Revision ID: 020788c82611
4+
Revises: e593fefbe9fc
5+
Create Date: 2025-03-13 14:42:21.461226
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
revision = "020788c82611"  # unique id of this migration
down_revision = "e593fefbe9fc"  # migration this one applies on top of
branch_labels = None  # no named branch labels for this revision
depends_on = None  # no cross-branch dependency
17+
18+
19+
def upgrade():
    """Widen matrix_dataset.value_type to the valuetype enum that includes list_strings."""
    # ### commands auto generated by Alembic - please adjust! ###
    value_type_enum = sa.Enum(
        "continuous", "categorical", "list_strings", name="valuetype"
    )
    with op.batch_alter_table("matrix_dataset", schema=None) as batch_op:
        batch_op.alter_column(
            "value_type",
            existing_type=sa.VARCHAR(length=11),
            type_=value_type_enum,
            existing_nullable=False,
        )

    # ### end Alembic commands ###
32+
33+
34+
def downgrade():
    """Revert matrix_dataset.value_type back to a plain VARCHAR(11) column."""
    # ### commands auto generated by Alembic - please adjust! ###
    value_type_enum = sa.Enum(
        "continuous", "categorical", "list_strings", name="valuetype"
    )
    with op.batch_alter_table("matrix_dataset", schema=None) as batch_op:
        batch_op.alter_column(
            "value_type",
            existing_type=value_type_enum,
            type_=sa.VARCHAR(length=11),
            existing_nullable=False,
        )

    # ### end Alembic commands ###

breadbox/breadbox/compute/dataset_uploads_tasks.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,9 @@ def dataset_upload(
155155
dataset_params.version,
156156
dataset_params.description,
157157
)
158-
save_dataset_file(dataset_id, data_df, settings.filestore_location)
158+
save_dataset_file(
159+
dataset_id, data_df, dataset_params.value_type, settings.filestore_location
160+
)
159161

160162
else:
161163
index_type = _get_dimension_type(db, dataset_params.index_type)

breadbox/breadbox/io/data_validation.py

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,26 @@ def _validate_dimension_type_metadata_file(
134134
return df
135135

136136

137+
def _parse_list_strings(val):
138+
example_list_string = '["x", "y"]'
139+
try:
140+
deserialized_str_list = json.loads(val)
141+
except Exception as e:
142+
raise FileValidationError(
143+
f"Value: {val} must be able to be deserialized into a list. Please make sure values for columns of type list_strings are a stringified list (ex: {example_list_string})"
144+
) from e
145+
146+
if not isinstance(deserialized_str_list, list):
147+
raise FileValidationError(
148+
f"Value: {val} must be able to be deserialized into a list. Please make sure values for columns of type list_strings are a stringified list (ex: {example_list_string})"
149+
)
150+
151+
if not all(isinstance(x, str) for x in deserialized_str_list):
152+
raise FileValidationError(
153+
f"All values in {deserialized_str_list} must be a string (ex: {example_list_string})"
154+
)
155+
156+
137157
def _validate_data_value_type(
138158
df: pd.DataFrame, value_type: ValueType, allowed_values: Optional[List]
139159
):
@@ -170,6 +190,18 @@ def _validate_data_value_type(
170190

171191
int_df = int_df.astype(int)
172192
return int_df
193+
elif value_type == ValueType.list_strings:
194+
195+
def validate_list_strings(val):
196+
if not pd.isnull(val):
197+
_parse_list_strings(val)
198+
return val
199+
else:
200+
# hdf5 will stringify 'None' or '<NA>'. Use empty string to represent NAs instead
201+
return ""
202+
203+
df = df.applymap(validate_list_strings)
204+
return df.astype(str)
173205
else:
174206
if not all([is_numeric_dtype(df[col].dtypes) for col in df.columns]):
175207
raise FileValidationError(
@@ -190,7 +222,7 @@ def _read_parquet(file, value_type: ValueType) -> pd.DataFrame:
190222
# parquet files have the types encoded in the file, so we'll convert after the fact
191223
if value_type == ValueType.continuous:
192224
dtype = "Float64"
193-
elif value_type == ValueType.categorical:
225+
elif value_type == ValueType.categorical or value_type == ValueType.list_strings:
194226
dtype = "string"
195227
else:
196228
raise ValueError(f"Invalid value type: {value_type}")
@@ -217,7 +249,7 @@ def _read_csv(file: BinaryIO, value_type: ValueType) -> pd.DataFrame:
217249

218250
if value_type == ValueType.continuous:
219251
dtypes_ = dict(zip(cols, ["string"] + (["Float64"] * (len(cols) - 1))))
220-
elif value_type == ValueType.categorical:
252+
elif value_type == ValueType.categorical or value_type == ValueType.list_strings:
221253
dtypes_ = dict(zip(cols, ["string"] * len(cols)))
222254
else:
223255
raise ValueError(f"Invalid value type: {value_type}")
@@ -422,7 +454,7 @@ def validate_and_upload_dataset_files(
422454
)
423455

424456
# TODO: Move save function to api layer. Need to make sure the db save is successful first
425-
save_dataset_file(dataset_id, data_df, filestore_location)
457+
save_dataset_file(dataset_id, data_df, value_type, filestore_location)
426458

427459
return dataframe_validated_dimensions
428460

@@ -575,27 +607,9 @@ def _validate_tabular_df_schema(
575607
dimension_type_identifier: str,
576608
):
577609
def can_parse_list_strings(val):
578-
example_list_string = '["x", "y"]'
579-
if val is not None and not pd.isnull(val):
580-
try:
581-
deserialized_str_list = json.loads(val)
582-
except Exception as e:
583-
raise FileValidationError(
584-
f"Value: {val} must be able to be deserialized into a list. Please make sure values for columns of type list_strings are a stringified list (ex: {example_list_string})"
585-
) from e
586-
587-
if not isinstance(deserialized_str_list, list):
588-
raise FileValidationError(
589-
f"Value: {val} must be able to be deserialized into a list. Please make sure values for columns of type list_strings are a stringified list (ex: {example_list_string})"
590-
)
591-
592-
if not all(isinstance(x, str) for x in deserialized_str_list):
593-
raise FileValidationError(
594-
f"All values in {deserialized_str_list} must be a string (ex: {example_list_string})"
595-
)
596-
return True
597-
else:
598-
return True
610+
if not pd.isnull(val):
611+
_parse_list_strings(val)
612+
return True
599613

600614
def get_checks_for_col(annotation_type: AnnotationType):
601615
checks = []

breadbox/breadbox/io/filestore_crud.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import shutil
3+
import json
34
from typing import Any, List, Optional, Union
45

56
import pandas as pd
@@ -11,13 +12,21 @@
1112

1213

1314
def save_dataset_file(
    dataset_id: str,
    data_df: pd.DataFrame,
    value_type: ValueType,
    filestore_location: str,
):
    """Persist a dataset's matrix as an HDF5 file under the filestore.

    Creates a fresh per-dataset directory (errors if it already exists),
    then writes the dataframe with a string dtype for list_strings
    datasets and a float dtype for everything else.
    """
    os.makedirs(os.path.join(filestore_location, dataset_id))

    dtype = "str" if value_type == ValueType.list_strings else "float"

    write_hdf5_file(
        get_file_location(dataset_id, filestore_location, DATA_FILE), data_df, dtype
    )
2231

2332

@@ -94,6 +103,10 @@ def get_df_by_value_type(
94103
# Convert numerical values back to origincal categorical value
95104
df = df.astype(int)
96105
df = df.applymap(lambda x: dataset_allowed_values[x])
106+
elif value_type == ValueType.list_strings:
107+
# NOTE: String data in HDF5 datasets is read as bytes by default
108+
# len of byte encoded empty string should be 0
109+
df = df.applymap(lambda x: json.loads(x) if len(x) != 0 else None)
97110
return df
98111

99112

breadbox/breadbox/io/hdf5_utils.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional
1+
from typing import List, Optional, Literal
22

33
import h5py
44
import numpy as np
@@ -21,10 +21,15 @@ def create_index_dataset(f: h5py.File, key: str, idx: pd.Index):
2121
)
2222

2323

24-
def write_hdf5_file(path: str, df: pd.DataFrame):
24+
def write_hdf5_file(path: str, df: pd.DataFrame, dtype: Literal["float", "str"]):
2525
f = h5py.File(path, mode="w")
2626
try:
27-
f.create_dataset("data", shape=df.shape, dtype=np.float64, data=df.values)
27+
f.create_dataset(
28+
"data",
29+
shape=df.shape,
30+
dtype=h5py.string_dtype() if dtype == "str" else np.float64,
31+
data=df.values,
32+
)
2833

2934
create_index_dataset(f, "features", df.columns)
3035
create_index_dataset(f, "samples", df.index)

breadbox/breadbox/schemas/dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class FeatureSampleIdentifier(enum.Enum):
2424
class ValueType(enum.Enum):
    """How the cells of a matrix dataset are typed.

    continuous: numeric (float) values.
    categorical: values drawn from a fixed set of allowed strings.
    list_strings: each cell is a JSON-encoded list of strings.
    """

    continuous = "continuous"
    categorical = "categorical"
    list_strings = "list_strings"
2728

2829

2930
class AnnotationType(enum.Enum):

0 commit comments

Comments
 (0)