Open
Description
Describe the issue:
Applying `map_blocks` to a Dask Array constructed by `from_array` from an HDF5 dataset raises `TypeError: h5py objects cannot be pickled`. This only occurs when using `distributed`.
This may be related to #860 and #861. In fact, the minimal verifiable example below is borrowed from `test_h5py_serialize`, mentioned in #860.
Minimal Complete Verifiable Example:
# Minimal reproducer: apply map_blocks to a dask array backed by an h5py
# dataset while a distributed Client is active.
# from dask.utils import SerializableLock
import distributed
import h5py
import dask.array as da

client = distributed.Client()  # Error does not occur if this is commented out

fn = 'temporary_file_for_test.h5'
lock = True  # lock = SerializableLock('hdf5') and lock = False also leads to an error.

# Create a small chunked int32 dataset to read back below.
with h5py.File(fn, mode='w') as f:
    x = f.create_dataset('/group/x', shape=(4,), dtype='i4',
                         chunks=(2,))
    x[:] = [1, 2, 3, 4]

with h5py.File(fn, mode='r') as f:
    dset = f['/group/x']
    # Wrap the open h5py dataset directly; the dask graph holds a reference
    # to `dset`.
    x = da.from_array(dset, chunks=dset.chunks, lock=lock)

    def func(array):
        """Square every element of a block."""
        return array**2

    # NOTE(review): indentation was lost in the paste; compute() is assumed
    # to run inside the `with` block so the file is still open, which is what
    # lets the run without a Client succeed. With the Client active, this is
    # the line that raises "TypeError: h5py objects cannot be pickled".
    res = da.map_blocks(func, x).compute()
Anything else we need to know?:
The traceback in JupyterLab looks like the following:
2025-02-13 22:00:12,875 - distributed.protocol.pickle - ERROR - Failed to serialize <ToPickle: HighLevelGraph with 1 layers.
<dask.highlevelgraph.HighLevelGraph object at 0x7fc9a461f350>
0. 140504024525248
>.
Traceback (most recent call last):
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py", line 73](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py#line=72), in dumps
result = cloudpickle.dumps(x, **dump_kwargs)
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py", line 1537](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1536), in dumps
cp.dump(obj)
~~~~~~~^^^^^
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py", line 1303](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1302), in dump
return super().dump(obj)
~~~~~~~~~~~~^^^^^
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py", line 366](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py#line=365), in __getnewargs__
raise TypeError("h5py objects cannot be pickled")
TypeError: h5py objects cannot be pickled
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py", line 77](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py#line=76), in dumps
result = cloudpickle.dumps(x, **dump_kwargs)
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py", line 1537](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1536), in dumps
cp.dump(obj)
~~~~~~~^^^^^
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py", line 1303](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1302), in dump
return super().dump(obj)
~~~~~~~~~~~~^^^^^
File "[/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py", line 366](http://127.0.0.1:20201/home/sm69/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py#line=365), in __getnewargs__
raise TypeError("h5py objects cannot be pickled")
TypeError: h5py objects cannot be pickled
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py:73](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py#line=72), in dumps(x, buffer_callback, protocol)
72 buffers.clear()
---> 73 result = cloudpickle.dumps(x, **dump_kwargs)
74 except Exception:
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py:1537](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1536), in dumps(obj, protocol, buffer_callback)
1536 cp = Pickler(file, protocol=protocol, buffer_callback=buffer_callback)
-> 1537 cp.dump(obj)
1538 return file.getvalue()
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py:1303](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1302), in Pickler.dump(self, obj)
1302 try:
-> 1303 return super().dump(obj)
1304 except RuntimeError as e:
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py:366](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py#line=365), in HLObject.__getnewargs__(self)
357 """Disable pickle.
358
359 Handles for HDF5 objects can't be reliably deserialised, because the
(...)
364 limitations, look at the h5pickle project on PyPI.
365 """
--> 366 raise TypeError("h5py objects cannot be pickled")
TypeError: h5py objects cannot be pickled
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/serialize.py:366](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/serialize.py#line=365), in serialize(x, serializers, on_error, context, iterate_collection)
365 try:
--> 366 header, frames = dumps(x, context=context) if wants_context else dumps(x)
367 header["serializer"] = name
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/serialize.py:78](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/serialize.py#line=77), in pickle_dumps(x, context)
76 writeable.append(not f.readonly)
---> 78 frames[0] = pickle.dumps(
79 x,
80 buffer_callback=buffer_callback,
81 protocol=context.get("pickle-protocol", None) if context else None,
82 )
83 header = {
84 "serializer": "pickle",
85 "writeable": tuple(writeable),
86 }
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py:77](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/pickle.py#line=76), in dumps(x, buffer_callback, protocol)
76 buffers.clear()
---> 77 result = cloudpickle.dumps(x, **dump_kwargs)
78 except Exception:
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py:1537](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1536), in dumps(obj, protocol, buffer_callback)
1536 cp = Pickler(file, protocol=protocol, buffer_callback=buffer_callback)
-> 1537 cp.dump(obj)
1538 return file.getvalue()
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py:1303](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/cloudpickle/cloudpickle.py#line=1302), in Pickler.dump(self, obj)
1302 try:
-> 1303 return super().dump(obj)
1304 except RuntimeError as e:
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py:366](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/h5py/_hl/base.py#line=365), in HLObject.__getnewargs__(self)
357 """Disable pickle.
358
359 Handles for HDF5 objects can't be reliably deserialised, because the
(...)
364 limitations, look at the h5pickle project on PyPI.
365 """
--> 366 raise TypeError("h5py objects cannot be pickled")
TypeError: h5py objects cannot be pickled
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
Cell In[1], line 19
17 def func(array):
18 return array**2
---> 19 res = da.map_blocks(func, x).compute()
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/dask/base.py:372](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/dask/base.py#line=371), in DaskMethodsMixin.compute(self, **kwargs)
348 def compute(self, **kwargs):
349 """Compute this dask collection
350
351 This turns a lazy Dask collection into its in-memory equivalent.
(...)
370 dask.compute
371 """
--> 372 (result,) = compute(self, traverse=False, **kwargs)
373 return result
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/dask/base.py:660](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/dask/base.py#line=659), in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
657 postcomputes.append(x.__dask_postcompute__())
659 with shorten_traceback():
--> 660 results = schedule(dsk, keys, **kwargs)
662 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
File [~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/serialize.py:392](http://127.0.0.1:20201/~/.conda/envs/pyathena/lib/python3.13/site-packages/distributed/protocol/serialize.py#line=391), in serialize(x, serializers, on_error, context, iterate_collection)
390 except Exception:
391 raise TypeError(msg) from exc
--> 392 raise TypeError(msg, str_x) from exc
393 else: # pragma: nocover
394 raise ValueError(f"{on_error=}; expected 'message' or 'raise'")
TypeError: ('Could not serialize object of type HighLevelGraph', '<ToPickle: HighLevelGraph with 1 layers.\n<dask.highlevelgraph.HighLevelGraph object at 0x7fc9a461f350>\n 0. 140504024525248\n>')
Environment:
- Dask version: 2024.12.1
- Python version: 3.13.1
- Operating System: linux
- Install method (conda, pip, source): conda (mamba, actually)