Skip to content

Commit 1da5067

Browse files
committed
Add condition to use alternative chunked storage only for sparse matrices
1 parent 66f4246 commit 1da5067

File tree

2 files changed

+30
-27
lines changed

2 files changed

+30
-27
lines changed

breadbox/breadbox/io/data_validation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,10 +197,10 @@ def validate_list_strings(val):
197197
_parse_list_strings(val)
198198
return val
199199
else:
200-
# hdf5 will stringify 'None' or '<NA>'. Use empty string to represent NAs instead
201200
return pd.NA
202201

203202
df = df.applymap(validate_list_strings)
203+
# astype(str) will stringify 'None' or '<NA>'. Using pd.StringDtype() will preserve <NA>
204204
return df.astype(pd.StringDtype())
205205
else:
206206
if not all([is_numeric_dtype(df[col].dtypes) for col in df.columns]):

breadbox/breadbox/io/hdf5_utils.py

Lines changed: 29 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,32 +24,35 @@ def create_index_dataset(f: h5py.File, key: str, idx: pd.Index):
2424
def write_hdf5_file(path: str, df: pd.DataFrame, dtype: Literal["float", "str"]):
2525
f = h5py.File(path, mode="w")
2626
try:
27-
dataset = f.create_dataset(
28-
"data",
29-
shape=df.shape,
30-
dtype=h5py.string_dtype() if dtype == "str" else np.float64,
31-
# data=df.values,
32-
chunks=(1, 1),
33-
)
34-
rows, cols = np.where(df.notnull())
35-
for row, col in zip(rows, cols):
36-
dataset[row, col] = df.iloc[row, col]
37-
38-
# also took far too long
39-
# for col in range(df.shape[1]):
40-
# if (~(pd.isna(df.iloc[:,col]))).sum() == 0:
41-
# continue
42-
# for row in range(df.shape[0]):
43-
# value = df.iloc[row, col]
44-
# if not pd.isna(value):
45-
# dataset[row, col] = value
46-
47-
# this literally took forever
48-
# for row in range(df.shape[0]):
49-
# for col in range(df.shape[1]):
50-
# value = df.iloc[row, col]
51-
# if not pd.isna(value):
52-
# dataset[row, col] = value
27+
# Get the row,col positions where df values are not null
28+
rows_idx, cols_idx = np.where(df.notnull())
29+
total_nulls = df.size - len(rows_idx)
30+
# Determine whether matrix is considered sparse (more than 60% of elements are null). Use chunked storage for sparse matrices for more optimal storage
31+
if total_nulls / df.size > 0.6:
32+
dataset = f.create_dataset(
33+
"data",
34+
shape=df.shape,
35+
dtype=h5py.string_dtype() if dtype == "str" else np.float64,
36+
chunks=(
37+
1,
38+
1,
39+
), # Chunk size (1, 1) chosen somewhat arbitrarily, since it appears to yield smaller storage than auto-chunking
40+
)
41+
# only insert nonnull values into hdf5 at given positions
42+
for row_idx, col_idx in zip(rows_idx, cols_idx):
43+
dataset[row_idx, col_idx] = df.iloc[row_idx, col_idx]
44+
else:
45+
if dtype == "str":
46+
# NOTE: hdf5 will fail to stringify None or <NA>. Use empty string to represent NAs instead
47+
df = df.fillna("")
48+
49+
# NOTE: For a large and dense string matrix, the size of the hdf5 will be very large. Right now, list of string matrices are a very rare use case and it is unlikely we'll encounter one that is not sparse. However, if that changes, we should consider other hdf5 size optimization methods such as compression
50+
f.create_dataset(
51+
"data",
52+
shape=df.shape,
53+
dtype=h5py.string_dtype() if dtype == "str" else np.float64,
54+
data=df.values,
55+
)
5356

5457
create_index_dataset(f, "features", df.columns)
5558
create_index_dataset(f, "samples", df.index)

0 commit comments

Comments
 (0)