Skip to content

Commit 1da5067

Browse files
committed
Add condition to use alternative chunked storage only for sparse matrices
1 parent 66f4246 commit 1da5067

File tree

2 files changed

+30
-27
lines changed

2 files changed

+30
-27
lines changed

breadbox/breadbox/io/data_validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,10 +197,10 @@ def validate_list_strings(val):
197197
_parse_list_strings(val)
198198
return val
199199
else:
200-
# hdf5 will stringify 'None' or '<NA>'. Use empty string to represent NAs instead
201200
return pd.NA
202201

203202
df = df.applymap(validate_list_strings)
203+
# astype(str) will stringify 'None' or '<NA>'. Using pd.StringDtype() will preserve <NA>
204204
return df.astype(pd.StringDtype())
205205
else:
206206
if not all([is_numeric_dtype(df[col].dtypes) for col in df.columns]):

breadbox/breadbox/io/hdf5_utils.py

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,32 +24,35 @@ def create_index_dataset(f: h5py.File, key: str, idx: pd.Index):
2424
def write_hdf5_file(path: str, df: pd.DataFrame, dtype: Literal["float", "str"]):
2525
f = h5py.File(path, mode="w")
2626
try:
27-
dataset = f.create_dataset(
28-
"data",
29-
shape=df.shape,
30-
dtype=h5py.string_dtype() if dtype == "str" else np.float64,
31-
# data=df.values,
32-
chunks=(1, 1),
33-
)
34-
rows, cols = np.where(df.notnull())
35-
for row, col in zip(rows, cols):
36-
dataset[row, col] = df.iloc[row, col]
37-
38-
# also took far too long
39-
# for col in range(df.shape[1]):
40-
# if (~(pd.isna(df.iloc[:,col]))).sum() == 0:
41-
# continue
42-
# for row in range(df.shape[0]):
43-
# value = df.iloc[row, col]
44-
# if not pd.isna(value):
45-
# dataset[row, col] = value
46-
47-
# this literally took forever
48-
# for row in range(df.shape[0]):
49-
# for col in range(df.shape[1]):
50-
# value = df.iloc[row, col]
51-
# if not pd.isna(value):
52-
# dataset[row, col] = value
27+
# Get the row,col positions where df values are not null
28+
rows_idx, cols_idx = np.where(df.notnull())
29+
total_nulls = df.size - len(rows_idx)
30+
# Determine whether matrix is considered sparse (more than 60% of elements are null). Use chunked storage for sparse matrices for more optimal storage
31+
if total_nulls / df.size > 0.6:
32+
dataset = f.create_dataset(
33+
"data",
34+
shape=df.shape,
35+
dtype=h5py.string_dtype() if dtype == "str" else np.float64,
36+
chunks=(
37+
1,
38+
1,
39+
), # Chunk size (1, 1) chosen somewhat arbitrarily, since it appears to yield smaller storage than auto-chunking
40+
)
41+
# only insert nonnull values into hdf5 at given positions
42+
for row_idx, col_idx in zip(rows_idx, cols_idx):
43+
dataset[row_idx, col_idx] = df.iloc[row_idx, col_idx]
44+
else:
45+
if dtype == "str":
46+
# NOTE: hdf5 will fail to stringify None or <NA>. Use empty string to represent NAs instead
47+
df = df.fillna("")
48+
49+
# NOTE: For a large and dense string matrix, the size of the hdf5 will be very large. Right now, list of string matrices are a very rare use case and it is unlikely we'll encounter one that is not sparse. However, if that changes, we should consider other hdf5 size optimization methods such as compression
50+
f.create_dataset(
51+
"data",
52+
shape=df.shape,
53+
dtype=h5py.string_dtype() if dtype == "str" else np.float64,
54+
data=df.values,
55+
)
5356

5457
create_index_dataset(f, "features", df.columns)
5558
create_index_dataset(f, "samples", df.index)

0 commit comments

Comments
 (0)