77import cooler
88
99import dask
10- from dask .dataframe .core import new_dd_object
1110from dask .base import tokenize
1211import dask .dataframe as dd
1312import dask .array as da
@@ -21,7 +20,6 @@ def get_group_info(path, grouppath, keys):
2120 keys = list (grp .keys ())
2221
2322 nrows = len (grp [keys [0 ]])
24- dtypes = {key : grp [key ].dtype for key in keys }
2523
2624 categoricals = {}
2725 for key in keys :
@@ -33,6 +31,10 @@ def get_group_info(path, grouppath, keys):
3331 meta = pd .DataFrame (
3432 {key : np .array ([], dtype = grp [key ].dtype ) for key in keys },
3533 columns = keys )
34+
35+ for key in categoricals :
36+ meta [key ] = pd .Categorical ([],
37+ categories = categoricals [key ], ordered = True )
3638
3739 return nrows , keys , meta , categoricals
3840
@@ -63,7 +65,7 @@ def restore_categories(data, categorical_columns):
6365 for key , category_dict in categorical_columns .items ():
6466 data [key ] = pd .Categorical .from_codes (
6567 data [key ],
66- categories ,
68+ category_dict ,
6769 ordered = True )
6870 return data
6971
@@ -103,7 +105,7 @@ def daskify(filepath, grouppath, keys=None, chunksize=int(10e6), index=None,
103105
104106 # Make a unique task name
105107 token = tokenize (filepath , grouppath , chunksize , keys )
106- task_name = 'daskify-h5py-table' + token
108+ task_name = 'daskify-h5py-table- ' + token
107109
108110 # Partition the table
109111 divisions = (0 ,) + tuple (range (- 1 , nrows , chunksize ))[1 :]
@@ -120,7 +122,7 @@ def daskify(filepath, grouppath, keys=None, chunksize=int(10e6), index=None,
120122 dsk [task_name , i ] = (pd .DataFrame , data_dict , None , meta .columns )
121123
122124 # Generate ddf from dask graph
123- df = new_dd_object (dsk , task_name , meta , divisions )
125+ df = dd . DataFrame (dsk , task_name , meta , divisions )
124126 if index is not None :
125127 df = df .set_index (index , sorted = True , drop = False )
126128 return df
0 commit comments