Skip to content

Commit f74cda9

Browse files
ravwojdyla-agentravwojdylaclaudegithub-actions[bot]
authored
zephyr: pack pickle inside parquet shuffle (#3656)
## Summary - When scatter items aren't Arrow-serializable, serialize them via `pickle.dumps()` into a binary `pickled` column in Parquet instead of falling back to per-chunk `.pkl` files - Eliminates the M×RxC file blowup in pickle mode while preserving single-file-per-shard compactness and predicate pushdown - Adds `is_pickled` field to `ParquetDiskChunk` for transparent deserialization on read Fixes #3640 ## Test plan - [x] Existing `test_group_by_non_vortex_serializable` passes on all 3 backends (local, iris, ray) — exercises the full pickle-in-parquet scatter+reduce path with `frozenset` items - [x] New `test_parquet_disk_chunk_pickle_roundtrip` — unit test for pickle envelope write/read - [x] All 49 groupby tests pass, all 61 execution tests pass - [x] Pre-commit clean 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Rafal Wojdyla <ravwojdyla@gmail.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com> Co-authored-by: Rafal Wojdyla <ravwojdyla@users.noreply.github.com>
1 parent 925472f commit f74cda9

File tree

2 files changed

+71
-31
lines changed

2 files changed

+71
-31
lines changed

lib/zephyr/src/zephyr/execution.py

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def __iter__(self) -> Iterator: ...
6363
_ZEPHYR_SHUFFLE_SHARD_IDX_COL = "shard_idx"
6464
_ZEPHYR_SHUFFLE_CHUNK_IDX_COL = "chunk_idx"
6565
_ZEPHYR_SHUFFLE_ITEM_COL = "item"
66+
_ZEPHYR_SHUFFLE_PICKLED_COL = "pickled"
6667

6768

6869
@dataclass(frozen=True)
@@ -114,34 +115,38 @@ class ParquetDiskChunk:
114115
for different (shard_idx, chunk_idx) pairs. Each chunk is pre-sorted
115116
by key, preserving the invariant needed for k-way merge in Reduce.
116117
117-
Items are stored wrapped in an envelope struct with routing metadata::
118+
Items are stored in one of two envelope formats:
118119
119-
{"shard_idx": int, "chunk_idx": int, "item": <user_data>}
120+
* **Native** (``is_pickled=False``): ``{"shard_idx", "chunk_idx", "item": <data>}``
121+
* **Pickle** (``is_pickled=True``): ``{"shard_idx", "chunk_idx", "pickled": <bytes>}``
120122
121-
The ``read`` method filters by shard/chunk and unwraps the ``item`` field.
122-
Predicate pushdown in Parquet skips irrelevant row groups, so each
123-
reducer reads only its own data efficiently.
123+
The pickle envelope is used when items are not Arrow-serializable.
124124
"""
125125

126126
path: str
127127
filter_shard: int
128128
filter_chunk: int
129129
count: int
130+
is_pickled: bool = False
130131

131132
def __iter__(self) -> Iterator:
132133
return iter(self.read())
133134

134135
def read(self) -> list:
135136
"""Load filtered chunk data from a Parquet file, unwrapping envelope."""
137+
col = _ZEPHYR_SHUFFLE_PICKLED_COL if self.is_pickled else _ZEPHYR_SHUFFLE_ITEM_COL
136138
table = pq.read_table(
137139
self.path,
138-
columns=[_ZEPHYR_SHUFFLE_ITEM_COL],
140+
columns=[col],
139141
filters=(
140142
(pc.field(_ZEPHYR_SHUFFLE_SHARD_IDX_COL) == self.filter_shard)
141143
& (pc.field(_ZEPHYR_SHUFFLE_CHUNK_IDX_COL) == self.filter_chunk)
142144
),
143145
)
144-
return table.column(_ZEPHYR_SHUFFLE_ITEM_COL).to_pylist()
146+
items = table.column(col).to_pylist()
147+
if self.is_pickled:
148+
return [pickle.loads(b) for b in items]
149+
return items
145150

146151

147152
@dataclass
@@ -225,6 +230,18 @@ def _make_envelope(items: list, target_shard: int, chunk_idx: int) -> list[dict]
225230
]
226231

227232

233+
def _make_pickle_envelope(items: list, target_shard: int, chunk_idx: int) -> list[dict]:
234+
"""Wrap items as pickle-serialized bytes for Arrow-incompatible types."""
235+
return [
236+
{
237+
_ZEPHYR_SHUFFLE_SHARD_IDX_COL: target_shard,
238+
_ZEPHYR_SHUFFLE_CHUNK_IDX_COL: chunk_idx,
239+
_ZEPHYR_SHUFFLE_PICKLED_COL: cloudpickle.dumps(item),
240+
}
241+
for item in items
242+
]
243+
244+
228245
def _segment_path(base_path: str, seg_idx: int) -> str:
229246
"""Return the file path for a given segment index.
230247
@@ -245,13 +262,17 @@ def _write_parquet_scatter(
245262
stage_gen: Iterator[StageResultChunk],
246263
source_shard: int,
247264
parquet_path: str,
265+
pickled: bool = False,
248266
) -> list[ResultChunk]:
249267
"""Stream scatter chunks into Parquet files as row groups.
250268
251269
Writes batches to a Parquet file until a schema mismatch is detected
252270
(e.g. a field evolves from null to a concrete type). On mismatch the
253271
current file is closed, the schema is unified via ``pa.unify_schemas``,
254272
and a new segment file is opened with the evolved schema.
273+
274+
When ``pickled=True``, items are serialized via pickle into a binary
275+
``pickled`` column instead of being stored natively in the ``item`` column.
255276
"""
256277
chunk_results: list[_ChunkMetadata] = []
257278
per_shard_chunk_cnt: dict[int, int] = defaultdict(int)
@@ -285,7 +306,8 @@ def _flush_pending():
285306
target_shard = result.target_shard
286307
shard_chunk_idx = per_shard_chunk_cnt[target_shard]
287308
per_shard_chunk_cnt[target_shard] += 1
288-
envelope = _make_envelope(chunk_items, target_shard, shard_chunk_idx)
309+
envelope_fn = _make_pickle_envelope if pickled else _make_envelope
310+
envelope = envelope_fn(chunk_items, target_shard, shard_chunk_idx)
289311
chunk_arrow = pa.RecordBatch.from_pylist(envelope)
290312

291313
if schema is None:
@@ -328,7 +350,11 @@ def _flush_pending():
328350
source_shard=source_shard,
329351
target_shard=rec.target_shard,
330352
data=ParquetDiskChunk(
331-
path=rec.path, filter_shard=rec.target_shard, filter_chunk=rec.chunk_idx, count=rec.cnt
353+
path=rec.path,
354+
filter_shard=rec.target_shard,
355+
filter_chunk=rec.chunk_idx,
356+
count=rec.cnt,
357+
is_pickled=pickled,
332358
),
333359
)
334360
for rec in chunk_results
@@ -387,33 +413,26 @@ def _write_stage_chunks(
387413

388414
first_items = list(first_result.chunk)
389415

390-
# Test Arrow serializability on the first chunk to decide parquet vs pickle
391-
use_parquet = False
416+
# Prepend the already-consumed first result back into the stream
417+
first_with_materialized_chunk = dataclasses.replace(first_result, chunk=first_items)
418+
full_gen = itertools.chain([first_with_materialized_chunk], stage_gen)
419+
392420
if is_scatter:
421+
# Test Arrow serializability on the first chunk to decide native vs pickle envelope
422+
use_pickle_envelope = False
393423
try:
394424
test_envelope = _make_envelope(first_items, 0, 0)
395425
pa.RecordBatch.from_pylist(test_envelope)
396-
use_parquet = True
397426
logger.info("Using Parquet for scatter serialization for shard %d", source_shard)
398427
except Exception:
399-
sample_rows = str(test_envelope[:5]) if len(test_envelope) > 5 else str(test_envelope)
400-
if len(sample_rows) > 1000:
401-
sample_rows = sample_rows[:1000] + "...(truncated)"
402-
logger.warning(
403-
"Arrow scatter serialization failed for shard %d; "
404-
"falling back to pickle. Performance will be degraded. Sample rows: %s",
428+
use_pickle_envelope = True
429+
logger.info(
430+
"Using Parquet with pickle envelope for scatter serialization for shard %d",
405431
source_shard,
406-
sample_rows,
407-
exc_info=True,
408432
)
409433

410-
# Prepend the already-consumed first result back into the stream
411-
first_with_materialized_chunk = dataclasses.replace(first_result, chunk=first_items)
412-
full_gen = itertools.chain([first_with_materialized_chunk], stage_gen)
413-
414-
if use_parquet:
415434
parquet_path = f"{stage_dir}/shard-{shard_idx:04d}.parquet"
416-
return _write_parquet_scatter(full_gen, source_shard, parquet_path)
435+
return _write_parquet_scatter(full_gen, source_shard, parquet_path, pickled=use_pickle_envelope)
417436

418437
def chunk_path_fn(idx: int) -> str:
419438
return f"{stage_dir}/shard-{shard_idx:04d}/chunk-{idx:04d}.pkl"

lib/zephyr/tests/test_groupby.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -330,16 +330,16 @@ def reducer(key, items):
330330

331331

332332
def test_group_by_non_vortex_serializable(zephyr_ctx):
333-
"""Shuffle with items that Vortex/Arrow cannot serialize falls back to pickle.
333+
"""Shuffle with items that Vortex/Arrow cannot serialize uses pickle-in-parquet.
334334
335-
Uses SimpleNamespace (not a dict) so Arrow conversion fails and the pickle
336-
fallback is exercised. SimpleNamespace is a stdlib type importable by any
337-
worker process, avoiding module-resolution issues with test-local classes.
335+
Uses frozenset (not Arrow-serializable) so the pickle envelope path is
336+
exercised. Items are serialized via cloudpickle into a binary ``pickled``
337+
column inside Parquet, avoiding the N*M pickle file blowup.
338338
"""
339339

340340
from zephyr.writers import infer_arrow_schema
341341

342-
# NOTE: confirm frozenset is not arrow-serializable type to trigger the fallback path
342+
# NOTE: confirm frozenset is not arrow-serializable type to trigger the pickle envelope path
343343
with pytest.raises(pa.lib.ArrowInvalid, match="Could not convert frozenset"):
344344
infer_arrow_schema([{"foo": frozenset([1, 2, 3])}])
345345

@@ -361,6 +361,27 @@ def test_group_by_non_vortex_serializable(zephyr_ctx):
361361
assert results[1] == {"key": "b", "value": frozenset([2])}
362362

363363

364+
def test_parquet_disk_chunk_pickle_roundtrip(tmp_path):
365+
"""ParquetDiskChunk with is_pickled=True round-trips non-Arrow-serializable items."""
366+
import pyarrow.parquet as pq
367+
368+
from zephyr.execution import (
369+
ParquetDiskChunk,
370+
_make_pickle_envelope,
371+
)
372+
373+
items = [frozenset([1, 2]), frozenset([3, 4, 5])]
374+
envelope = _make_pickle_envelope(items, target_shard=0, chunk_idx=0)
375+
batch = pa.RecordBatch.from_pylist(envelope)
376+
377+
path = str(tmp_path / "test.parquet")
378+
pq.write_table(pa.Table.from_batches([batch]), path)
379+
380+
chunk = ParquetDiskChunk(path=path, filter_shard=0, filter_chunk=0, count=2, is_pickled=True)
381+
result = chunk.read()
382+
assert result == items
383+
384+
364385
def test_group_by_schema_evolution(zephyr_ctx):
365386
"""Schema evolution: a field that is null in some chunks gains a type in others."""
366387
data = []

0 commit comments

Comments
 (0)