Commit 8ca1768
zephyr: address review — ScatterWriter/Reader classes, cloudpickle, exact memory budget
Review feedback from rjpower on #4782:

- cloudpickle on write: `_write_chunk_frame` now uses `cloudpickle.dump`, so lambdas, local classes, and dynamically-defined callables survive scatter serialization. The read side stays stdlib `pickle.load`.
- ScatterWriter class: extracts `_write_scatter` into a class with `write(item)` / `close()` / context-manager support. Nonlocal state becomes instance attributes. `_write_scatter` remains as a thin wrapper.
- ScatterReader class: collapses `ScatterShard` + `ScatterFileIterator` + `_build_scatter_shard_from_manifest`. The `ScatterShard = ScatterReader` alias preserves backward compatibility for isinstance checks. A `ScatterReader.from_manifest()` classmethod replaces the factory function.
- url_to_fs resolved once per file: `ScatterFileIterator.__post_init__` resolves (fs, fs_path) once; `_iter_chunk` receives them directly.
- Memory budget from compressed chunk sizes: the fan-in computation uses `max_compressed_chunk_bytes` (exact, from the sidecar) instead of the `max_chunk_rows * avg_item_bytes` heuristic. `avg_item_bytes` is kept only for `compute_write_batch_size`.
- Removed the stats-unavailable fallback in plan.py: stats are always written by the scatter writer, so that dead-code fallback path is gone.
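The write/read asymmetry in the first bullet can be sketched as follows. This is an illustrative frame layout (4-byte length prefix plus pickled payload), not zephyr's actual `_write_chunk_frame` format; the sketch falls back to stdlib `pickle.dump` when cloudpickle is not installed, purely so it runs anywhere. The key property is that cloudpickle emits a standard pickle stream, so the read side can stay stdlib `pickle`.

```python
import io
import pickle
import struct

try:
    import cloudpickle  # serializes lambdas, local classes, dynamic callables
    _dump = cloudpickle.dump
except ImportError:      # fallback so the sketch runs without cloudpickle;
    _dump = pickle.dump  # lambdas would NOT survive this path

def write_chunk_frame(buf, items):
    """Hypothetical frame: little-endian u32 length prefix + pickled payload."""
    payload = io.BytesIO()
    _dump(items, payload)
    data = payload.getvalue()
    buf.write(struct.pack("<I", len(data)))
    buf.write(data)

def read_chunk_frame(buf):
    """Read side stays stdlib pickle: cloudpickle output is a valid pickle stream."""
    (n,) = struct.unpack("<I", buf.read(4))
    return pickle.loads(buf.read(n))

buf = io.BytesIO()
write_chunk_frame(buf, [{"k": 1}, {"k": 2}])
buf.seek(0)
assert read_chunk_frame(buf) == [{"k": 1}, {"k": 2}]
```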
1 parent bf00f9e commit 8ca1768

File tree

3 files changed: +216 -140 lines changed

lib/zephyr/src/zephyr/execution.py

Lines changed: 3 additions & 1 deletion

@@ -111,7 +111,9 @@ def read(self) -> list:
 from zephyr.shuffle import (  # noqa: E402
     ListShard,
     MemChunk,
-    ScatterShard,  # noqa: F401 — re-exported for plan.py and external callers
+    ScatterReader,  # noqa: F401 — re-exported for plan.py and external callers
+    ScatterShard,  # noqa: F401 — backward-compat alias for ScatterReader
+    ScatterWriter,  # noqa: F401 — re-exported for external callers
     _build_scatter_shard_from_manifest,  # noqa: F401 — re-exported for plan.py
     _write_scatter,
     _write_scatter_manifest,

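The reader-side collapse can be sketched similarly. The manifest fields below (`chunk_files`, `max_compressed_chunk_bytes`, `avg_item_bytes`) are assumed from the stats mentioned elsewhere in this commit, not the real manifest schema; the sketch shows the `from_manifest()` classmethod replacing the factory function and the module-level alias keeping `isinstance(x, ScatterShard)` checks working.

```python
from dataclasses import dataclass


@dataclass
class ScatterReader:
    """Sketch of the collapsed reader; fields are illustrative assumptions."""
    chunk_files: list
    max_compressed_chunk_bytes: int = 0
    avg_item_bytes: float = 0.0

    @classmethod
    def from_manifest(cls, manifest: dict) -> "ScatterReader":
        # Replaces the old _build_scatter_shard_from_manifest factory function.
        return cls(
            chunk_files=manifest["chunk_files"],
            max_compressed_chunk_bytes=manifest["max_compressed_chunk_bytes"],
            avg_item_bytes=manifest["avg_item_bytes"],
        )


# Backward-compat alias: existing isinstance(x, ScatterShard) checks still pass.
ScatterShard = ScatterReader

reader = ScatterReader.from_manifest(
    {"chunk_files": ["c0", "c1"], "max_compressed_chunk_bytes": 1024, "avg_item_bytes": 12.5}
)
assert isinstance(reader, ScatterShard)
```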
lib/zephyr/src/zephyr/plan.py

Lines changed: 5 additions & 11 deletions

@@ -25,7 +25,7 @@
 from iris.env_resources import TaskResources as _TaskResources
 from rigging.filesystem import url_to_fs

-from zephyr.external_sort import EXTERNAL_SORT_FAN_IN, external_sort_merge
+from zephyr.external_sort import external_sort_merge

 from zephyr.dataset import (
     Dataset,
@@ -635,7 +635,7 @@ def merge_key(item):

     # Check if external sort is needed BEFORE materializing all iterators.
     # ScatterShard can decide using manifest stats (no file opens needed).
-    from zephyr.shuffle import ScatterShard
+    from zephyr.shuffle import ScatterShard  # ScatterShard is an alias for ScatterReader

     use_external = (
         external_sort_dir is not None
@@ -648,10 +648,8 @@ def merge_key(item):

     memory_limit = _TaskResources.from_environment().memory_bytes
     # Per-iterator memory ~= compressed bytes for one chunk held by
-    # cat_file. Use uncompressed (max_chunk_rows * avg_item_bytes) as a
-    # conservative upper bound — scatter writes ASCII-ish data with
-    # mediocre zstd ratio.
-    per_iter_bytes = int(shard.max_chunk_rows * shard.avg_item_bytes)
+    # cat_file. Use the actual max compressed chunk size from the sidecar.
+    per_iter_bytes = shard.max_compressed_chunk_bytes
     fan_in = compute_fan_in(per_iter_bytes, memory_limit)
     write_batch_size = compute_write_batch_size(shard.avg_item_bytes)
     logger.info(
@@ -674,11 +672,7 @@ def merge_key(item):
     else:
         chunk_iterators = list(shard.get_iterators())
         logger.info(f"Merging {len(chunk_iterators):,} sorted chunk iterators")
-        if external_sort_dir is not None and len(chunk_iterators) > EXTERNAL_SORT_FAN_IN:
-            # Fallback: stats unavailable, use the hard-cap fan-in.
-            merged_stream = external_sort_merge(iter(chunk_iterators), merge_key, external_sort_dir)
-        else:
-            merged_stream = heapq.merge(*chunk_iterators, key=merge_key)
+        merged_stream = heapq.merge(*chunk_iterators, key=merge_key)
     yield from groupby(merged_stream, key=key_fn)
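The memory-budget change in the last hunk above amounts to: divide the task's memory limit by the cost of holding one compressed chunk per iterator, now taken exactly from the sidecar stats rather than estimated from uncompressed size. A minimal sketch of what `compute_fan_in` could look like, assuming a floor of 2 and a cap of 1024 (both illustrative guards, not zephyr's real constants):

```python
def compute_fan_in(per_iter_bytes: int, memory_limit: int,
                   floor: int = 2, cap: int = 1024) -> int:
    """How many sorted chunk iterators can be merged at once if each
    holds one compressed chunk in memory. floor/cap are illustrative."""
    if per_iter_bytes <= 0:
        return cap  # degenerate stats: no per-iterator cost to budget
    return max(floor, min(cap, memory_limit // per_iter_bytes))


# With exact sidecar stats (max compressed chunk = 64 MiB) and a 4 GiB
# task budget, 64 iterators fit. The old uncompressed heuristic
# (max_chunk_rows * avg_item_bytes) would have overstated per-iterator
# cost and produced a smaller fan-in.
fan_in = compute_fan_in(64 * 2**20, 4 * 2**30)  # fan_in == 64
```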
