Commit 82d2359

[zephyr] Replace Parquet shuffle with zstd-chunk format (#4782)
## Summary

- Replaces the Parquet-based scatter/reduce shuffle with flat zstd frames + a byte-range sidecar. Drops Arrow from the shuffle data plane.
- Adds memory-bounded external-sort fan-in and a byte-budgeted pass-1 spill batch so skewed and large-item shuffles don't OOM the worker.
- Net **−165 lines** across `shuffle.py` / `external_sort.py` / `plan.py` / `execution.py`.

## Format

Each scatter source writes a single binary file: a concatenation of zstd frames. Each frame is one sorted chunk, containing repeated `pickle.dump(sub_batch)` calls into a single zstd stream (sub-batch size = 1024 items by default). A JSON `.scatter_meta` sidecar maps `target_shard → [(offset, length)]`. Sidecars aggregate into one `scatter_metadata` manifest per stage (wire format unchanged from before).

On read, `ScatterFileIterator` fetches each chunk via one `cat_file` range GET and streams sub-batches with `pickle.load`. Per-iterator memory is bounded by `sub_batch_size * avg_item_bytes + chunk_compressed_bytes` — independent of chunk row count.

Gone:

- Segment rotation for schema evolution (`_ensure_writer`, `seg_idx`, `pa.unify_schemas`)
- Arrow-vs-pickle envelope peek (`use_pickle_envelope`, `pa.RecordBatch.from_pylist`)
- Row-group statistics + predicate pushdown (`equality_predicates`, `iter_parquet_row_groups`)
- PyArrow dataset memory-leak workaround (`_get_scatter_read_fs`, block-size budgeting)
- `pyarrow` on the shuffle data plane

## External-sort scaling

Two knobs now scale with the workload instead of being hardcoded:

- `compute_fan_in(per_iter_bytes, mem_limit)` — pass-1 fan-in floored at 4, capped at `EXTERNAL_SORT_FAN_IN=500`, otherwise sized so the open chunks (at `max_chunk_rows * avg_item_bytes` each) fit in 50% of worker memory.
- `compute_write_batch_size(avg_item_bytes)` — pass-1 `pending` buffer sized to ~64 MB of items (capped at 10k items). The prior fixed `_WRITE_BATCH_SIZE=10_000` could balloon to 10 GB on 1 MB items.

`_merge_sorted_chunks` reads `shard.max_chunk_rows` and `shard.avg_item_bytes` from the manifest and passes both values through to `external_sort_merge`.

## Benchmarks on marin-dev (4 workers × 8 GB RAM)

| Workload | Baseline (Parquet) | New | Hot-worker peak mem |
|---|---|---|---|
| Uniform 10 GB, 250 B items | 736 s | **392 s** | 551 MB |
| Uniform 10 GB, 1 MB items | — | **352 s** | 621 MB |
| Skew 90% 10 GB, 250 B items | **OOM** | 800 s | 3.09 GB |
| Skew 90% 10 GB, 1 MB items | **OOM** | 1349 s | 7.18 GB |
| Skew 90% 50 GB, 250 B items | **OOM** | 7796 s\* | 3.58 GB |
| Skew 90% 50 GB, 1 MB items | **OOM** | 2996 s | 7.33 GB |

\* Includes ~35 min lost to a mid-run coordinator preemption + automatic pipeline retry.

Uniform throughput is 1.88× faster than Parquet at 10 GB with small items. Every skewed case that the baseline OOMs on now completes, with memory bounded below the worker limit.

## Tests

- `lib/zephyr/tests/test_shuffle.py` rewritten for the new API (13 tests).
- `lib/zephyr/tests/test_groupby.py` pickle-roundtrip test updated.
- `lib/zephyr/tests/benchmark_shuffle.py` new — synthetic 10–50 GB shuffle with `--hot-shard-frac`/`--hot-key-pool`.
- `test_shuffle.py` (13), `test_groupby.py` (23), `test_execution.py` (40) all pass locally.

## Test plan

- [x] Unit tests pass locally
- [x] Uniform shuffle (10 GB, small + large items) on marin-dev
- [x] Skewed shuffle (10 GB + 50 GB, small + large items) on marin-dev
- [ ] Datakit ferry on marin — deferred (not necessary given direct shuffle benchmarks)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Rafal Wojdyla <ravwojdyla@gmail.com>
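As a rough sketch of the file format described above — illustrative only: `zlib` stands in for zstd (the framing and sidecar logic are identical, and CPython ships no zstd before 3.14), and helper names like `write_scatter_file` are hypothetical, not the actual `shuffle.py` API:

```python
import io
import json
import pickle
import zlib


def write_scatter_file(path: str, shard_chunks, sub_batch_size: int = 1024) -> None:
    """One compressed frame per sorted chunk; sidecar maps target_shard -> [(offset, length)]."""
    sidecar: dict[str, list] = {}
    with open(path, "wb") as f:
        for target_shard, items in shard_chunks:
            offset = f.tell()
            raw = io.BytesIO()
            for i in range(0, len(items), sub_batch_size):
                pickle.dump(items[i : i + sub_batch_size], raw)  # repeated dumps, one stream
            f.write(zlib.compress(raw.getvalue()))  # stand-in for one zstd frame
            sidecar.setdefault(str(target_shard), []).append((offset, f.tell() - offset))
    with open(path + ".scatter_meta", "w") as f:
        json.dump(sidecar, f)


def read_chunk(path: str, offset: int, length: int):
    """One range read per chunk, then stream sub-batches with pickle.load."""
    with open(path, "rb") as f:  # a remote FS would issue a cat_file range GET here
        f.seek(offset)
        frame = f.read(length)
    stream = io.BytesIO(zlib.decompress(frame))
    while True:
        try:
            yield from pickle.load(stream)  # one sub_batch list per load
        except EOFError:
            return
```

Because the reader holds one compressed frame plus one decoded sub-batch at a time, peak memory per iterator tracks the `sub_batch_size * avg_item_bytes + chunk_compressed_bytes` bound quoted above.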
1 parent e78e1dc commit 82d2359

File tree

8 files changed: +520 −642 lines changed


lib/zephyr/AGENTS.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,7 +15,7 @@ Lazy dataset processing library. Start with the shared instructions in `/AGENTS.
 - `src/zephyr/plan.py` — `compute_plan`, `PhysicalPlan`, operation fusion
 - `src/zephyr/readers.py` — `load_jsonl`, `load_parquet`, `load_vortex`, `InputFileSpec`
 - `src/zephyr/writers.py` — `write_jsonl_file`, `write_parquet_file`, `write_vortex_file`, Levanter cache writer
-- `src/zephyr/shuffle.py` — scatter pipeline internals (`ScatterParquetIterator`, `ScatterShard`, hash-routing, combiner, Parquet envelope)
+- `src/zephyr/shuffle.py` — scatter pipeline internals (`ScatterFileIterator`, `ScatterShard`, hash-routing, combiner, zstd-chunk file format with byte-range sidecar)
 - `src/zephyr/expr.py` — `Expr`, `col`, `lit` for filter expressions
 - `src/zephyr/external_sort.py` — `external_sort_merge` k-way merge of sorted runs
 - `src/zephyr/counters.py` — `increment` / `get_counters` per-worker counter API (`CounterSnapshot` lives in `execution.py`)
```

lib/zephyr/src/zephyr/execution.py

Lines changed: 7 additions & 21 deletions
```diff
@@ -35,7 +35,6 @@
 from typing import Any, Protocol
 
 import cloudpickle
-import pyarrow as pa
 from rigging.filesystem import open_url, url_to_fs
 from fray.v2 import ActorConfig, ActorFuture, ActorHandle, Client, ResourceConfig
 from fray.v2.client import JobHandle
@@ -112,10 +111,11 @@ def read(self) -> list:
 from zephyr.shuffle import (  # noqa: E402
     ListShard,
     MemChunk,
-    ScatterShard,  # noqa: F401 — re-exported for plan.py and external callers
+    ScatterReader,  # noqa: F401 — re-exported for plan.py and external callers
+    ScatterShard,  # noqa: F401 — backward-compat alias for ScatterReader
+    ScatterWriter,  # noqa: F401 — re-exported for external callers
     _build_scatter_shard_from_manifest,  # noqa: F401 — re-exported for plan.py
-    _make_envelope,
-    _write_parquet_scatter,
+    _write_scatter,
     _write_scatter_manifest,
     _SCATTER_MANIFEST_NAME,
 )
@@ -232,36 +232,22 @@ def _write_stage_output(
         TaskResult with a ListShard.
     """
     if scatter_op is not None:
-        # Peek first item to test Arrow serializability
         first_item = next(stage_gen, None)
         if first_item is None:
             return TaskResult(shard=ListShard(refs=[]))
 
         full_gen = itertools.chain([first_item], stage_gen)
 
-        use_pickle_envelope = False
-        try:
-            test_envelope = _make_envelope([first_item], 0, 0)
-            pa.RecordBatch.from_pylist(test_envelope)
-            logger.info("Using Parquet for scatter serialization for shard %d", source_shard)
-        except Exception:
-            use_pickle_envelope = True
-            logger.info(
-                "Using Parquet with pickle envelope for scatter serialization for shard %d",
-                source_shard,
-            )
-
         num_output_shards = scatter_op.num_output_shards if scatter_op.num_output_shards > 0 else total_shards
-        parquet_path = f"{stage_dir}/shard-{shard_idx:04d}.parquet"
-        shard = _write_parquet_scatter(
+        data_path = f"{stage_dir}/shard-{shard_idx:04d}.shuffle"
+        shard = _write_scatter(
             full_gen,
             source_shard,
-            parquet_path,
+            data_path,
             key_fn=scatter_op.key_fn,
             num_output_shards=num_output_shards,
             sort_fn=scatter_op.sort_fn,
             combiner_fn=scatter_op.combiner_fn,
-            pickled=use_pickle_envelope,
         )
         return TaskResult(shard=shard)
```

lib/zephyr/src/zephyr/external_sort.py

Lines changed: 66 additions & 9 deletions
```diff
@@ -7,9 +7,11 @@
 ``EXTERNAL_SORT_FAN_IN``, to avoid opening O(k) scanners simultaneously and
 exhausting worker memory.
 
-Pass 1: batch the k iterators into groups of EXTERNAL_SORT_FAN_IN, merge each
-group with heapq.merge, and spill items to a run file under
-``{external_sort_dir}/run-{i:04d}.spill`` via :class:`SpillWriter`.
+Pass 1: batch the k iterators into groups of ``fan_in`` (defaulting to
+``EXTERNAL_SORT_FAN_IN`` but typically lowered via :func:`compute_fan_in` to
+fit the worker's memory budget), merge each group with ``heapq.merge``, and
+spill items to a run file under ``{external_sort_dir}/run-{i:04d}.spill`` via
+:class:`SpillWriter`.
 
 Pass 2: heapq.merge over the (much smaller) set of run file iterators. Each
 iterator streams chunks from its spill file via :class:`SpillReader`; the read
@@ -31,20 +33,64 @@
 
 logger = logging.getLogger(__name__)
 
-# Maximum simultaneous chunk iterators per pass-1 batch.
+# Hard cap on simultaneous chunk iterators per pass-1 batch. Used as the
+# default when the caller cannot estimate per-iterator memory; otherwise
+# ``compute_fan_in`` lowers it to fit within the worker's memory budget.
 EXTERNAL_SORT_FAN_IN = 500
 
-# Items buffered before handing to the SpillWriter. Larger values amortize
-# per-chunk overhead in the spill format.
+# Fraction of worker memory budgeted for the open chunk iterators during a
+# pass-1 merge batch.
+_FAN_IN_MEMORY_FRACTION = 0.5
+
+# Floor on fan-in. Below 2, pass-1 just rewrites each chunk to its own run
+# file with no merging — pass-2 still produces correct output but the extra
+# round-trip is wasteful, so we keep at least a small merge fan-in.
+_FAN_IN_FLOOR = 4
+
+# Default item count per write into the SpillWriter in pass-1. Large enough
+# for good compression + low per-call overhead. For large items (e.g. 1 MB
+# each) the caller should pass a smaller ``write_batch_size`` via
+# :func:`compute_write_batch_size` so the in-memory ``pending`` buffer stays
+# bounded by bytes rather than count.
 _WRITE_BATCH_SIZE = 10_000
 
+# Target bytes for the in-memory pass-1 spill buffer.
+_WRITE_BATCH_TARGET_BYTES = 64 * 1024 * 1024
+
 # Target bytes per spill chunk in pass-1 runs.
 _ROW_GROUP_BYTES = 8 * 1024 * 1024
 
 # Fraction of container memory budgeted for pass-2 read buffers.
 _READ_MEMORY_FRACTION = 0.25
 
 
+def compute_fan_in(per_iterator_bytes: int, memory_limit: int) -> int:
+    """Pick a pass-1 fan-in that fits within the memory budget.
+
+    ``per_iterator_bytes`` is the caller's estimate of memory held per open
+    chunk iterator (typically compressed chunk bytes plus a small decoded
+    buffer). Returns at least ``_FAN_IN_FLOOR`` and at most
+    ``EXTERNAL_SORT_FAN_IN``.
+    """
+    if per_iterator_bytes <= 0 or memory_limit <= 0:
+        return EXTERNAL_SORT_FAN_IN
+    budget = int(memory_limit * _FAN_IN_MEMORY_FRACTION)
+    fan_in = budget // per_iterator_bytes
+    fan_in = max(_FAN_IN_FLOOR, fan_in)
+    return min(fan_in, EXTERNAL_SORT_FAN_IN)
+
+
+def compute_write_batch_size(avg_item_bytes: float) -> int:
+    """Pick a pass-1 pending-buffer size sized to a byte budget.
+
+    Caps at the ``_WRITE_BATCH_SIZE`` default when items are small.
+    """
+    if avg_item_bytes <= 0:
+        return _WRITE_BATCH_SIZE
+    by_bytes = int(_WRITE_BATCH_TARGET_BYTES // avg_item_bytes)
+    return max(1, min(by_bytes, _WRITE_BATCH_SIZE))
+
+
 def _safe_read_batch_size(n_runs: int, sample_run_path: str) -> int:
     """Compute a pass-2 read batch size that fits within the memory budget.
@@ -87,16 +133,25 @@ def external_sort_merge(
     chunk_iterators_gen: Iterator[Iterator],  # lazy — consumed in batches
     merge_key: Callable,
     external_sort_dir: str,
+    fan_in: int = EXTERNAL_SORT_FAN_IN,
+    write_batch_size: int = _WRITE_BATCH_SIZE,
 ) -> Iterator:
     """Merge ``chunk_iterators_gen`` via a two-pass external sort.
 
     Args:
         chunk_iterators_gen: Lazy iterator of sorted iterators (one per scatter chunk).
-            Consumed in batches of EXTERNAL_SORT_FAN_IN to avoid opening all file
+            Consumed in batches of ``fan_in`` to avoid opening all file
             handles simultaneously.
         merge_key: Key function passed to heapq.merge.
         external_sort_dir: GCS prefix for spill files, e.g.
             ``gs://bucket/.../stage1-external-sort/shard-0042``.
+        fan_in: Maximum number of chunk iterators to merge in one pass-1
+            batch. Defaults to ``EXTERNAL_SORT_FAN_IN``; callers should pass
+            a value computed by :func:`compute_fan_in` to bound memory.
+        write_batch_size: Item count threshold for the pass-1 ``pending``
+            buffer. Callers should pass a value from
+            :func:`compute_write_batch_size` to keep the buffer bounded by
+            bytes rather than item count.
 
     Yields:
         Items in merged sort order.
@@ -109,8 +164,10 @@ def external_sort_merge(
     spill_fs, spill_dir = url_to_fs(external_sort_dir)
     spill_fs.makedirs(spill_dir, exist_ok=True)
 
+    logger.info("External sort: pass-1 fan_in=%d, write_batch_size=%d", fan_in, write_batch_size)
+
     while True:
-        batch = list(islice(chunk_iterators_gen, EXTERNAL_SORT_FAN_IN))
+        batch = list(islice(chunk_iterators_gen, fan_in))
         if not batch:
             break
         run_path = f"{external_sort_dir}/run-{batch_idx:04d}.spill"
@@ -119,7 +176,7 @@
         with SpillWriter(run_path, row_group_bytes=_ROW_GROUP_BYTES) as writer:
             for item in heapq.merge(*batch, key=merge_key):
                 pending.append(item)
-                if len(pending) >= _WRITE_BATCH_SIZE:
+                if len(pending) >= write_batch_size:
                     writer.write(pending)
                     item_count += len(pending)
                     pending = []
```
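The two sizing helpers are pure arithmetic, so their behavior is easy to check standalone. This sketch copies the constants and formulas from the diff above and verifies the memory claim from the PR description: with 1 MB items the pass-1 buffer is now bounded at ~64 MB rather than 10 GB.

```python
# Constants mirrored from external_sort.py above.
EXTERNAL_SORT_FAN_IN = 500                    # hard cap on pass-1 fan-in
_FAN_IN_MEMORY_FRACTION = 0.5                 # fraction of worker memory for open chunk iterators
_FAN_IN_FLOOR = 4                             # keep at least a small merge fan-in
_WRITE_BATCH_SIZE = 10_000                    # item-count cap on the pending buffer
_WRITE_BATCH_TARGET_BYTES = 64 * 1024 * 1024  # byte budget for the pending buffer


def compute_fan_in(per_iterator_bytes: int, memory_limit: int) -> int:
    if per_iterator_bytes <= 0 or memory_limit <= 0:
        return EXTERNAL_SORT_FAN_IN  # no estimate available: fall back to the cap
    budget = int(memory_limit * _FAN_IN_MEMORY_FRACTION)
    return min(max(_FAN_IN_FLOOR, budget // per_iterator_bytes), EXTERNAL_SORT_FAN_IN)


def compute_write_batch_size(avg_item_bytes: float) -> int:
    if avg_item_bytes <= 0:
        return _WRITE_BATCH_SIZE
    return max(1, min(int(_WRITE_BATCH_TARGET_BYTES // avg_item_bytes), _WRITE_BATCH_SIZE))


MiB, GiB = 1024**2, 1024**3
print(compute_fan_in(16 * MiB, 8 * GiB))  # 256: half of an 8 GB worker / 16 MB per chunk
print(compute_fan_in(2 * GiB, 8 * GiB))   # 4: the floor kicks in for huge chunks
print(compute_write_batch_size(250))      # 10000: the byte budget never binds for small items
print(compute_write_batch_size(1 * MiB))  # 64: ~64 MB buffer instead of the old 10 GB
```

The old behavior is the last case with the fixed cap: 10,000 pending items at 1 MB each is 10 GB of buffered data, which exceeds an 8 GB worker on its own.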

lib/zephyr/src/zephyr/plan.py

Lines changed: 24 additions & 10 deletions
```diff
@@ -25,7 +25,7 @@
 from iris.env_resources import TaskResources as _TaskResources
 from rigging.filesystem import url_to_fs
 
-from zephyr.external_sort import EXTERNAL_SORT_FAN_IN, external_sort_merge
+from zephyr.external_sort import external_sort_merge
 
 from zephyr.dataset import (
     Dataset,
@@ -64,7 +64,7 @@ class Shard(Protocol):
 
     Implementations:
     - ListShard: backed by iterable references (source data, non-scatter)
-    - ScatterShard: backed by scatter Parquet files with predicate pushdown
+    - ScatterShard: backed by scatter zstd-chunk files with byte-range sidecar
     """
 
     def __iter__(self) -> Iterator: ...
@@ -635,7 +635,7 @@ def merge_key(item):
 
     # Check if external sort is needed BEFORE materializing all iterators.
     # ScatterShard can decide using manifest stats (no file opens needed).
-    from zephyr.shuffle import ScatterShard
+    from zephyr.shuffle import ScatterShard  # ScatterShard is an alias for ScatterReader
 
     use_external = (
         external_sort_dir is not None
@@ -644,21 +644,35 @@ def merge_key(item):
     )
 
     if use_external:
+        from zephyr.external_sort import compute_fan_in, compute_write_batch_size
+
+        memory_limit = _TaskResources.from_environment().memory_bytes
+        # Per-iterator memory ~= compressed bytes for one chunk held by
+        # cat_file. Use the actual max compressed chunk size from the sidecar.
+        per_iter_bytes = shard.max_compressed_chunk_bytes
+        fan_in = compute_fan_in(per_iter_bytes, memory_limit)
+        write_batch_size = compute_write_batch_size(shard.avg_item_bytes)
         logger.info(
-            "External sort triggered for shard with %d iterators, spilling to %s",
+            "External sort triggered for shard with %d iterators, "
+            "fan_in=%d (per_iter≈%dKB), write_batch_size=%d, spilling to %s",
             sum(it.chunk_count for it in shard.iterators),
+            fan_in,
+            per_iter_bytes // 1024,
+            write_batch_size,
             external_sort_dir,
         )
         # Pass lazy generator — external_sort_merge consumes in batches without opening all files
-        merged_stream = external_sort_merge(shard.get_iterators(), merge_key, external_sort_dir)
+        merged_stream = external_sort_merge(
+            shard.get_iterators(),
+            merge_key,
+            external_sort_dir,
+            fan_in=fan_in,
+            write_batch_size=write_batch_size,
+        )
     else:
         chunk_iterators = list(shard.get_iterators())
         logger.info(f"Merging {len(chunk_iterators):,} sorted chunk iterators")
-        if external_sort_dir is not None and len(chunk_iterators) > EXTERNAL_SORT_FAN_IN:
-            # Fallback: stats unavailable, use fan_in threshold
-            merged_stream = external_sort_merge(iter(chunk_iterators), merge_key, external_sort_dir)
-        else:
-            merged_stream = heapq.merge(*chunk_iterators, key=merge_key)
+        merged_stream = heapq.merge(*chunk_iterators, key=merge_key)
     yield from groupby(merged_stream, key=key_fn)
```
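For intuition on the in-memory fallback path retained above, here is a toy run of `heapq.merge` over pre-sorted chunk iterators followed by a group-by — using stdlib `itertools.groupby` as an assumed stand-in for zephyr's `groupby`:

```python
import heapq
from itertools import groupby

# Two chunks, each already sorted by key — the invariant the scatter writer guarantees.
chunks = [[("a", 1), ("b", 2)], [("a", 3), ("c", 4)]]

merge_key = lambda kv: kv[0]
# heapq.merge streams the chunks in globally sorted order without materializing them.
merged_stream = heapq.merge(*map(iter, chunks), key=merge_key)

# groupby over a key-sorted stream yields one contiguous group per key.
result = {k: [v for _, v in grp] for k, grp in groupby(merged_stream, key=merge_key)}
print(result)  # {'a': [1, 3], 'b': [2], 'c': [4]}
```

This only works because every chunk is sorted on the same key; the external-sort path exists to preserve that invariant when the chunks are too numerous to open all at once.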
