Skip to content

Commit 0327403

Browse files
committed
Extract categorical type information
1 parent 7a70bb7 commit 0327403

File tree

2 files changed

+8
-6
lines changed

2 files changed

+8
-6
lines changed

cooler/contrib/dask.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import cooler
88

99
import dask
10-
from dask.dataframe.core import new_dd_object
1110
from dask.base import tokenize
1211
import dask.dataframe as dd
1312
import dask.array as da
@@ -21,7 +20,6 @@ def get_group_info(path, grouppath, keys):
2120
keys = list(grp.keys())
2221

2322
nrows = len(grp[keys[0]])
24-
dtypes = {key: grp[key].dtype for key in keys}
2523

2624
categoricals = {}
2725
for key in keys:
@@ -33,6 +31,10 @@ def get_group_info(path, grouppath, keys):
3331
meta = pd.DataFrame(
3432
{key: np.array([], dtype=grp[key].dtype) for key in keys},
3533
columns=keys)
34+
35+
for key in categoricals:
36+
meta[key] = pd.Categorical([],
37+
categories=categoricals[key], ordered=True)
3638

3739
return nrows, keys, meta, categoricals
3840

@@ -63,7 +65,7 @@ def restore_categories(data, categorical_columns):
6365
for key, category_dict in categorical_columns.items():
6466
data[key] = pd.Categorical.from_codes(
6567
data[key],
66-
categories,
68+
category_dict,
6769
ordered=True)
6870
return data
6971

@@ -103,7 +105,7 @@ def daskify(filepath, grouppath, keys=None, chunksize=int(10e6), index=None,
103105

104106
# Make a unique task name
105107
token = tokenize(filepath, grouppath, chunksize, keys)
106-
task_name = 'daskify-h5py-table' + token
108+
task_name = 'daskify-h5py-table-' + token
107109

108110
# Partition the table
109111
divisions = (0,) + tuple(range(-1, nrows, chunksize))[1:]
@@ -120,7 +122,7 @@ def daskify(filepath, grouppath, keys=None, chunksize=int(10e6), index=None,
120122
dsk[task_name, i] = (pd.DataFrame, data_dict, None, meta.columns)
121123

122124
# Generate ddf from dask graph
123-
df = new_dd_object(dsk, task_name, meta, divisions)
125+
df = dd.DataFrame(dsk, task_name, meta, divisions)
124126
if index is not None:
125127
df = df.set_index(index, sorted=True, drop=False)
126128
return df

cooler/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ def unstarred(args):
556556

557557
def make_meta(x, index=None):
558558
"""
559-
Extracted from dask/dataframe/utils.py
559+
Extracted from dask/dataframe/utils.py (BSD licensed)
560560
561561
Create an empty pandas object containing the desired metadata.
562562

0 commit comments

Comments
 (0)