Skip to content

Reading RecordArray with BitMaskedArray contents parquet file + repartitioning + getitem + compute errors #583

@ikrommyd

Description

@ikrommyd

To reproduce:

import awkward as ak
import dask_awkward as dak
import numpy as np

layout = ak.contents.bitmaskedarray.BitMaskedArray(  # option-type layout whose validity mask is stored one bit per entry
    ak.index.Index(
        np.packbits(  # packs the 13 flags below into uint8 bytes (2 bytes, zero-padded at the end)
            np.array(
                [  # one flag per entry: 1 = valid, 0 = missing (because valid_when=True below)
                    1,
                    1,
                    1,
                    1,
                    0,
                    0,
                    0,
                    0,
                    1,
                    0,
                    1,
                    0,
                    1,
                ],
                np.uint8,
            )
        )
    ),
    ak.contents.numpyarray.NumpyArray(
        np.array(  # 14 values, i.e. one more than length=13 below
            [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6]
        )
    ),
    valid_when=True,  # a set bit marks a valid (non-missing) entry
    length=13,  # logical length; the packed mask holds 16 bits, so this says how many are meaningful
    lsb_order=False,  # bits are read most-significant first, matching np.packbits' default bit order
)
array = ak.zip({"a": layout, "b": layout}, depth_limit=1)  # record array with two fields, both backed by the same layout
ak.to_parquet(array, "test.parquet")  # round-trip through Parquet: the failure only shows up after reading the file back
array = dak.from_parquet("test.parquet")
array.a.compute() # is fine
array.repartition(one_to_n=5).a.compute() # errors

Here I create a layout and save it to disk for demonstration purposes. Note that if you use `array = dak.from_awkward(array, npartitions=1)` instead of writing the array to disk and reading it back, the same repartition + getitem + compute sequence works fine.

Full error trace:

Cell In[2], line 41
     39 array = dak.from_parquet("test.parquet")
     40 array.a.compute() # is fine
---> 41 array.repartition(one_to_n=5).a.compute() # errors

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/dask/base.py:375, in DaskMethodsMixin.compute(self, **kwargs)
    351 def compute(self, **kwargs):
    352     """Compute this dask collection
    353
    354     This turns a lazy Dask collection into its in-memory equivalent.
   (...)    373     dask.compute
    374     """
--> 375     (result,) = compute(self, traverse=False, **kwargs)
    376     return result

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/dask/base.py:661, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    658     postcomputes.append(x.__dask_postcompute__())
    660 with shorten_traceback():
--> 661     results = schedule(dsk, keys, **kwargs)
    663 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/dask_awkward/lib/structure.py:1400, in _subpart(data, parts, part)
   1398     return data
   1399 rows_per = len(data) // parts
-> 1400 return data[
   1401     part * rows_per : None if part == (parts - 1) else (part + 1) * rows_per
   1402 ]

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/highlevel.py:1105, in Array.__getitem__(self, where)
    676 def __getitem__(self, where):
    677     """
    678     Args:
    679         where (many types supported; see below): Index of positions to
   (...)   1103     have the same dimension as the array being indexed.
   1104     """
-> 1105     with ak._errors.SlicingErrorContext(self, where):
   1106         # Handle named axis
   1107         (_, ndim) = self._layout.minmax_depth
   1108         named_axis = _get_named_axis(self)

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/_errors.py:80, in ErrorContext.__exit__(self, exception_type, exception_value, traceback)
     78     self._slate.__dict__.clear()
     79     # Handle caught exception
---> 80     raise self.decorate_exception(exception_type, exception_value)
     81 else:
     82     # Step out of the way so that another ErrorContext can become primary.
     83     if self.primary() is self:

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/highlevel.py:1113, in Array.__getitem__(self, where)
   1109 where = _normalize_named_slice(named_axis, where, ndim)
   1111 NamedAxis.mapping = named_axis
-> 1113 indexed_layout = prepare_layout(self._layout._getitem(where, NamedAxis))
   1115 if NamedAxis.mapping:
   1116     return ak.operations.ak_with_named_axis._impl(
   1117         indexed_layout,
   1118         named_axis=NamedAxis.mapping,
   (...)   1121         attrs=self._attrs,
   1122     )

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/contents/content.py:545, in Content._getitem(self, where, named_axis)
    540 elif isinstance(where, slice) and where.step is None:
    541     # Ensure that start, stop are non-negative!
    542     start, stop, _, _ = self._backend.index_nplike.derive_slice_for_length(
    543         normalize_slice(where, nplike=self._backend.index_nplike), self.length
    544     )
--> 545     return self._getitem_range(start, stop)
    547 elif isinstance(where, slice):
    548     return self._getitem((where,), named_axis)

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/contents/recordarray.py:455, in RecordArray._getitem_range(self, start, stop)
    446     return RecordArray(
    447         [],
    448         self._fields,
   (...)    451         backend=self._backend,
    452     )
    453 else:
    454     return RecordArray(
--> 455         [x._getitem_range(start, stop) for x in self._contents],
    456         self._fields,
    457         length,
    458         parameters=self._parameters,
    459         backend=self._backend,
    460     )

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/contents/bitmaskedarray.py:513, in BitMaskedArray._getitem_range(self, start, stop)
    512 def _getitem_range(self, start: IndexType, stop: IndexType) -> Content:
--> 513     return self.to_ByteMaskedArray()._getitem_range(start, stop)

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/contents/bitmaskedarray.py:395, in BitMaskedArray.to_ByteMaskedArray(self)
    387 bytemask = ak.index.Index8.empty(
    388     self._mask.length * 8, self._backend.index_nplike
    389 )
    390 assert (
    391     bytemask.nplike is self._backend.nplike
    392     and self._mask.nplike is self._backend.nplike
    393 )
    394 self._backend.maybe_kernel_error(
--> 395     self._backend[
    396         "awkward_BitMaskedArray_to_ByteMaskedArray",
    397         bytemask.dtype.type,
    398         self._mask.dtype.type,
    399     ](
    400         bytemask.data,
    401         self._mask.data,
    402         self._mask.length,
    403         False,  # this differs from the kernel call in 'bytemask'
    404         self._lsb_order,
    405     )
    406 )
    407 return ByteMaskedArray(
    408     bytemask[: self._length],
    409     self._content,
    410     self._valid_when,
    411     parameters=self._parameters,
    412 )

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/_kernels.py:94, in NumpyKernel.__call__(self, *args)
     90 assert len(args) == len(self._impl.argtypes)
     92 args = materialize_if_virtual(*args)
---> 94 return self._impl(
     95     *(self._cast(x, t) for x, t in zip(args, self._impl.argtypes))
     96 )

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/_kernels.py:95, in <genexpr>(.0)
     90 assert len(args) == len(self._impl.argtypes)
     92 args = materialize_if_virtual(*args)
     94 return self._impl(
---> 95     *(self._cast(x, t) for x, t in zip(args, self._impl.argtypes))
     96 )

File ~/miniforge3/envs/venv/lib/python3.12/site-packages/awkward/_kernels.py:83, in NumpyKernel._cast(cls, x, t)
     81         return ctypes.cast(x, t)
     82     else:
---> 83         raise AssertionError(
     84             f"Only NumPy buffers should be passed to Numpy Kernels, received {type(t).__name__}"
     85         )
     86 else:
     87     return x

AssertionError: Only NumPy buffers should be passed to Numpy Kernels, received PyCPointerType


See if this has been reported at https://github.com/scikit-hep/awkward/issues

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions