Commit 8b8b9bf

[zephyr] Add SpillWriter and TableAccumulator, refactor scatter + external sort

SpillWriter: byte-budgeted ParquetWriter wrapper with background write
thread for GCS upload overlap. Uses zstd-1 compression (was snappy for
spills). Two modes: write_table() accumulates and auto-flushes row groups;
write_row_group() writes immediately (for scatter, where each chunk must
be a separate row group).

TableAccumulator: byte-budgeted Arrow table batching, replacing the
row-count-based _MERGE_OUTPUT_BATCH_SIZE in the k-way merge output.

Refactors:
- external_sort: _write_spill_file uses SpillWriter (was binary-search
  row-group sizing); merge output uses TableAccumulator
- shuffle: _write_parquet_scatter uses SpillWriter (was manual
  ParquetWriter + pending_chunk + _flush_pending)
1 parent a967602 commit 8b8b9bf

10 files changed: 377 additions & 92 deletions
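
The new zephyr/spill_writer.py module is among the files not excerpted below, so here is a rough sketch of what the description above implies. The class name, method names, and constructor shape match the call sites in this commit's diffs; the internals (queue depth, threading, flush policy) are guesses, not the actual implementation:

```python
# Illustrative sketch of SpillWriter -- NOT the actual implementation
# (spill_writer.py is not shown in this excerpt). Signatures inferred from
# the call sites in external_sort.py and shuffle.py below.
import queue
import threading

import pyarrow as pa
import pyarrow.parquet as pq


class SpillWriter:
    """Byte-budgeted ParquetWriter wrapper. Parquet encoding and upload run
    on a background thread, overlapping with the caller's CPU work."""

    def __init__(self, path: str, schema: pa.Schema, row_group_bytes: int = 8 << 20):
        self._writer = pq.ParquetWriter(path, schema, compression="zstd", compression_level=1)
        self._budget = row_group_bytes
        self._pending: list[pa.Table] = []
        self._pending_bytes = 0
        # Bounded queue applies backpressure if the writer thread falls behind.
        self._queue: queue.Queue = queue.Queue(maxsize=2)
        self._thread = threading.Thread(target=self._drain, daemon=True)
        self._thread.start()

    def _drain(self) -> None:
        while (table := self._queue.get()) is not None:
            self._writer.write_table(table)  # each enqueued table -> one row group

    def write_row_group(self, table: pa.Table) -> None:
        """Scatter mode: write immediately as its own row group."""
        self._queue.put(table)

    def write_table(self, table: pa.Table) -> None:
        """Spill mode: accumulate rows, auto-flushing a row group each time
        the byte budget is reached (large inputs split into several groups)."""
        if len(table) == 0:
            return
        avg_row = max(1, table.nbytes // len(table))
        for batch in table.to_batches(max_chunksize=max(1, self._budget // avg_row)):
            self._pending.append(pa.Table.from_batches([batch]))
            self._pending_bytes += batch.nbytes
            if self._pending_bytes >= self._budget:
                self._flush_pending()

    def _flush_pending(self) -> None:
        if self._pending:
            self._queue.put(pa.concat_tables(self._pending, promote_options="default"))
            self._pending, self._pending_bytes = [], 0

    def close(self) -> None:
        self._flush_pending()
        self._queue.put(None)  # sentinel stops the writer thread
        self._thread.join()
        self._writer.close()

    def __enter__(self) -> "SpillWriter":
        return self

    def __exit__(self, *exc) -> None:
        self.close()
```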

lib/iris/src/iris/actor/client.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -59,8 +59,8 @@ def __init__(
         resolver: Resolver,
         name: str,
         call_timeout: float | None = None,
-        max_call_attempts: int = 5,
-        backoff: ExponentialBackoff = ExponentialBackoff(initial=0.1, maximum=10.0, factor=2.0, jitter=0.25),
+        max_call_attempts: int = 10,
+        backoff: ExponentialBackoff = ExponentialBackoff(initial=0.5, maximum=10.0, factor=2.0, jitter=0.25),
     ):
         """Initialize the actor client.
```
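For scale (not in the diff): ignoring the ±25% jitter and assuming one sleep between consecutive attempts, this change grows the worst-case retry window from roughly 1.5s to roughly 55s:

```python
# Worst-case total backoff: delays start at `initial`, multiply by `factor`,
# and clamp at `maximum`; one sleep between each of the N attempts.
def total_backoff(attempts: int, initial: float, maximum: float, factor: float) -> float:
    delay, total = initial, 0.0
    for _ in range(attempts - 1):
        total += min(delay, maximum)
        delay *= factor
    return total

print(total_backoff(5, 0.1, 10.0, 2.0))   # 1.5  (old defaults)
print(total_backoff(10, 0.5, 10.0, 2.0))  # 55.5 (new defaults)
```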
lib/iris/src/iris/client/client.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -555,7 +555,7 @@ def submit(
         coscheduling: CoschedulingConfig | None = None,
         replicas: int = 1,
         max_retries_failure: int = 0,
-        max_retries_preemption: int = 100,
+        max_retries_preemption: int = 10_000,
         timeout: Duration | None = None,
         user: str | None = None,
         reservation: list[ReservationEntry] | None = None,
```

lib/iris/src/iris/cluster/controller/scaling_group.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -98,8 +98,8 @@ class AvailabilityState:
     until: Timestamp | None = None
 
 
-DEFAULT_SCALE_UP_RATE_LIMIT = 5  # per minute
-DEFAULT_SCALE_DOWN_RATE_LIMIT = 5  # per minute
+DEFAULT_SCALE_UP_RATE_LIMIT = 32  # per minute
+DEFAULT_SCALE_DOWN_RATE_LIMIT = 32  # per minute
 DEFAULT_SCALE_UP_COOLDOWN = Duration.from_minutes(1)
 DEFAULT_BACKOFF_INITIAL = Duration.from_minutes(5)
 DEFAULT_BACKOFF_MAX = Duration.from_minutes(15)
```

lib/iris/src/iris/cluster/providers/gcp/bootstrap.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -136,9 +136,11 @@ def replace_var(match: re.Match) -> str:
 sudo systemctl start docker || true
 
 # Tune network stack for high-connection workloads (#3066).
-# Expands ephemeral port range and allows reuse of TIME_WAIT sockets.
+# Expands ephemeral port range, allows reuse of TIME_WAIT sockets,
+# and raises listen backlog for actor servers handling 1000s of workers.
 sudo sysctl -w net.ipv4.ip_local_port_range="1024 65535"
 sudo sysctl -w net.ipv4.tcp_tw_reuse=1
+sudo sysctl -w net.core.somaxconn=4096
 
 # Create cache directory
 sudo mkdir -p {{ cache_dir }}
```
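
For context on the new sysctl (not part of the diff): the kernel silently caps the backlog argument of listen(2) at net.core.somaxconn, so without this setting a server asking for a deep accept queue gets the default instead. A minimal illustration:

```python
# listen(backlog) is silently capped at net.core.somaxconn (historically 128;
# 4096 since Linux 5.4). Raising the sysctl lets the accept queue actually
# absorb a reconnect stampede from thousands of workers.
import socket

srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
srv.bind(("127.0.0.1", 0))
srv.listen(4096)  # effective backlog = min(4096, net.core.somaxconn)
```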

lib/iris/src/iris/cluster/runtime/docker.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -618,6 +618,8 @@ def _docker_create(
         "create",
         "--ulimit",
         "core=0:0",
+        "--ulimit",
+        "nofile=65536:524288",
         "-w",
         config.workdir,
     ]
```
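
A quick way to sanity-check the new limit from inside a container (illustrative snippet, not from this commit):

```python
# With --ulimit nofile=65536:524288, a process in the container should see a
# soft limit of 65536 and a hard limit of 524288 open file descriptors.
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(soft, hard)  # expected: 65536 524288
```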

lib/zephyr/src/zephyr/external_sort.py

Lines changed: 11 additions & 31 deletions
```diff
@@ -30,6 +30,8 @@
 import pyarrow.parquet as pq
 from iris.env_resources import TaskResources as _TaskResources
 
+from zephyr.spill_writer import SpillWriter, TableAccumulator
+
 logger = logging.getLogger(__name__)
 
 # Fraction of worker memory available for sort (pass 1 and pass 2 are
@@ -43,9 +45,6 @@
 # memory during merge, so this controls per-run memory footprint.
 _SPILL_ROW_GROUP_TARGET_BYTES = 8 * 1024 * 1024  # 8 MB
 
-# Output batch size yielded from the merge.
-_MERGE_OUTPUT_BATCH_SIZE = 100_000
-
 
 @dataclass
 class _SortBudget:
@@ -79,24 +78,8 @@ def _compute_budget(chunk_bytes: int) -> _SortBudget:
 
 def _write_spill_file(table: pa.Table, path: str) -> None:
     """Write a sorted table as a Parquet file with byte-budgeted row groups."""
-    writer = pq.ParquetWriter(path, table.schema)
-    offset = 0
-    n = len(table)
-    while offset < n:
-        # Grow the row group until we hit the byte target.
-        # Double the slice size each probe to keep overhead O(log n).
-        lo = offset + 1
-        hi = n
-        while lo < hi:
-            mid = (lo + hi + 1) // 2
-            if table.slice(offset, mid - offset).nbytes <= _SPILL_ROW_GROUP_TARGET_BYTES:
-                lo = mid
-            else:
-                hi = mid - 1
-        rg_end = lo
-        writer.write_table(table.slice(offset, rg_end - offset))
-        offset = rg_end
-    writer.close()
+    with SpillWriter(path, table.schema, row_group_bytes=_SPILL_ROW_GROUP_TARGET_BYTES) as w:
+        w.write_table(table)
 
 
 def _promote_to_large_string(table: pa.Table) -> pa.Table:
@@ -215,8 +198,7 @@ def _streaming_k_way_merge(
     for src in sources:
         heapq.heappush(heap, _MergeEntry(src.current_sort_value(), src.idx, src))
 
-    output_chunks: list[pa.Table] = []
-    output_rows = 0
+    accumulator = TableAccumulator(_SPILL_ROW_GROUP_TARGET_BYTES)
 
     while heap:
         entry = heapq.heappop(heap)
@@ -229,19 +211,17 @@
         take_count = winner.remaining()
 
         chunk = winner.take(take_count)
-        output_chunks.append(chunk)
-        output_rows += len(chunk)
 
         if winner.has_data:
             heapq.heappush(heap, _MergeEntry(winner.current_sort_value(), winner.idx, winner))
 
-        if output_rows >= _MERGE_OUTPUT_BATCH_SIZE:
-            yield pa.concat_tables(output_chunks, promote_options="default")
-            output_chunks.clear()
-            output_rows = 0
+        merged = accumulator.add(chunk)
+        if merged is not None:
+            yield merged
 
-    if output_chunks:
-        yield pa.concat_tables(output_chunks, promote_options="default")
+    remaining = accumulator.flush()
+    if remaining is not None:
+        yield remaining
 
 
 def external_sort_merge(
```
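
TableAccumulator is the other new piece: batching by bytes rather than by the fixed 100_000-row _MERGE_OUTPUT_BATCH_SIZE makes output-batch memory predictable regardless of row width. A minimal sketch consistent with the call sites above (the real class lives in zephyr/spill_writer.py, which is not in this excerpt):

```python
# Illustrative sketch of TableAccumulator -- NOT the actual implementation.
# Behavior inferred from the merge loop above: add() returns a merged batch
# once the byte budget is reached, otherwise None; flush() drains the rest.
import pyarrow as pa


class TableAccumulator:
    def __init__(self, budget_bytes: int):
        self._budget = budget_bytes
        self._chunks: list[pa.Table] = []
        self._nbytes = 0

    def add(self, chunk: pa.Table) -> pa.Table | None:
        self._chunks.append(chunk)
        self._nbytes += chunk.nbytes
        if self._nbytes < self._budget:
            return None
        return self.flush()

    def flush(self) -> pa.Table | None:
        if not self._chunks:
            return None
        out = pa.concat_tables(self._chunks, promote_options="default")
        self._chunks, self._nbytes = [], 0
        return out
```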

lib/zephyr/src/zephyr/plan.py

Lines changed: 33 additions & 13 deletions
```diff
@@ -27,7 +27,7 @@
 from iris.env_resources import TaskResources as _TaskResources
 from rigging.filesystem import url_to_fs
 
-from zephyr.external_sort import external_sort_merge
+from zephyr.external_sort import _promote_to_large_string, external_sort_merge
 
 from zephyr.dataset import (
     Dataset,
@@ -211,7 +211,7 @@ def _arrow_merge_sorted_chunks(shard: Any) -> pa.Table:
         all_tables.append(table)
     if not all_tables:
         return pa.table({})
-    combined = pa.concat_tables(all_tables, promote_options="default")
+    combined = pa.concat_tables([_promote_to_large_string(t) for t in all_tables], promote_options="default")
     sort_keys: list[tuple[str, str]] = [(_ZEPHYR_SORT_KEY, "ascending")]
     if _ZEPHYR_SORT_SECONDARY in combined.column_names:
         sort_keys.append((_ZEPHYR_SORT_SECONDARY, "ascending"))
@@ -259,22 +259,42 @@ def _chunk_tables() -> Iterator[pa.Table]:
            for it in shard.iterators:
                yield from it.get_chunk_tables()
 
-        all_tables = list(external_sort_merge(_chunk_tables(), sort_keys, external_sort_dir))
-        if not all_tables:
-            return
-        sorted_table = pa.concat_tables(all_tables, promote_options="default")
+        # Stream through the merge, grouping by sort key across batch boundaries.
+        # Only one batch + one group's accumulated rows are in memory at a time.
+        is_gen = inspect.isgeneratorfunction(reducer_fn)
+        current_key = None
+        current_group_tables: list[pa.Table] = []
 
-        key_col = sorted_table.column(_ZEPHYR_SORT_KEY)
-        pickled = _ZEPHYR_PAYLOAD in sorted_table.column_names
+        for batch_table in external_sort_merge(_chunk_tables(), sort_keys, external_sort_dir):
+            pickled = _ZEPHYR_PAYLOAD in batch_table.column_names
+            key_col = batch_table.column(_ZEPHYR_SORT_KEY)
 
-        is_gen = inspect.isgeneratorfunction(reducer_fn)
-        for start, end, key_value in _find_group_boundaries(key_col):
-            group_table = sorted_table.slice(start, end - start)
+            for start, end, key_value in _find_group_boundaries(key_col):
+                group_slice = batch_table.slice(start, end - start)
+
+                if current_key is None:
+                    current_key = key_value
+                    current_group_tables = [group_slice]
+                elif key_value == current_key:
+                    current_group_tables.append(group_slice)
+                else:
+                    group_table = pa.concat_tables(current_group_tables, promote_options="default")
+                    group_items = unwrap_items(group_table, pickled)
+                    if is_gen:
+                        yield from reducer_fn(current_key, iter(group_items))
+                    else:
+                        yield reducer_fn(current_key, iter(group_items))
+                    current_key = key_value
+                    current_group_tables = [group_slice]
+
+        if current_group_tables:
+            group_table = pa.concat_tables(current_group_tables, promote_options="default")
+            pickled = _ZEPHYR_PAYLOAD in group_table.column_names
             group_items = unwrap_items(group_table, pickled)
             if is_gen:
-                yield from reducer_fn(key_value, iter(group_items))
+                yield from reducer_fn(current_key, iter(group_items))
             else:
-                yield reducer_fn(key_value, iter(group_items))
+                yield reducer_fn(current_key, iter(group_items))
         return
 
     sorted_table = _arrow_merge_sorted_chunks(shard)
```
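
The carry logic above exists because a group's rows can straddle two merge output batches; the reducer must not fire for a key until a different key (or end of stream) proves the group is complete. A toy illustration of the same control flow, with strings standing in for Arrow table slices:

```python
# Sorted stream split into two batches: the "b" group spans the boundary,
# so it is reduced only after "c" arrives in the second batch.
batches = [["a", "a", "b"], ["b", "b", "c"]]

current_key, run = None, []
for batch in batches:
    for key in batch:  # stand-in for per-batch _find_group_boundaries runs
        if current_key is None or key == current_key:
            run.append(key)
        else:
            print(f"reduce({current_key}, {len(run)} rows)")  # group complete
            run = [key]
        current_key = key
if run:
    print(f"reduce({current_key}, {len(run)} rows)")  # flush the final group
# -> reduce(a, 2 rows), reduce(b, 3 rows), reduce(c, 1 rows)
```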

lib/zephyr/src/zephyr/shuffle.py

Lines changed: 24 additions & 40 deletions
```diff
@@ -28,12 +28,12 @@
 import pyarrow as pa
 import pyarrow.compute as pc
 import pyarrow.dataset as pad
-import pyarrow.parquet as pq
 from iris.env_resources import TaskResources as _TaskResources
 from rigging.filesystem import open_url, url_to_fs
 from rigging.timing import log_time
 
 from zephyr.plan import deterministic_hash
+from zephyr.spill_writer import SpillWriter
 from zephyr.writers import ensure_parent_dir
 
 logger = logging.getLogger(__name__)
@@ -571,12 +571,7 @@ def _write_parquet_scatter(
     seg_idx = 0
     seg_paths: list[str] = []
     schema: pa.Schema | None = None
-    writer: pq.ParquetWriter | None = None
-    seg_file = ""
-
-    pending_chunk: pa.RecordBatch | None = None
-    pending_target: int = -1
-    pending_cnt: int = 0
+    spill_writer: SpillWriter | None = None
 
     avg_item_bytes: float = 0.0
     _sampled_avg = False
@@ -586,57 +581,36 @@ def _get_buffer(target: int) -> _ShardBuffer:
             buffers[target] = _ShardBuffer(shard_idx=target, pickled=pickled, has_sort=sort_fn is not None)
         return buffers[target]
 
-    def _flush_pending() -> None:
-        nonlocal n_chunks_flushed, pending_chunk
-        if pending_chunk is None:
-            return
-        writer.write_batch(pending_chunk)
-        seg_shard_counts[seg_idx][pending_target] = seg_shard_counts[seg_idx].get(pending_target, 0) + 1
-        n_chunks_flushed += 1
-        pending_chunk = None
-        if n_chunks_flushed % 10 == 0:
-            logger.info(
-                "[shard %d segment %d] Wrote %d parquet chunks so far (latest chunk size: %d items)",
-                source_shard,
-                seg_idx,
-                n_chunks_flushed,
-                pending_cnt,
-            )
-
     def _ensure_writer(chunk_schema: pa.Schema) -> pa.Schema:
-        nonlocal schema, writer, seg_file, seg_idx
+        nonlocal schema, spill_writer, seg_idx
         if schema is None:
             schema = chunk_schema
             seg_file = _segment_path(parquet_path, seg_idx)
             seg_paths.append(seg_file)
             ensure_parent_dir(seg_file)
-            writer = pq.ParquetWriter(seg_file, schema, compression="zstd", compression_level=1)
+            spill_writer = SpillWriter(seg_file, schema)
        elif chunk_schema != schema:
-            _flush_pending()
-            writer.close()
+            spill_writer.close()
             schema = pa.unify_schemas([schema, chunk_schema])
             seg_idx += 1
             for buf in buffers.values():
                 buf.chunk_idx = 0
             seg_file = _segment_path(parquet_path, seg_idx)
             seg_paths.append(seg_file)
             ensure_parent_dir(seg_file)
-            writer = pq.ParquetWriter(seg_file, schema, compression="zstd", compression_level=1)
+            spill_writer = SpillWriter(seg_file, schema)
             logger.info(
                 "[shard %d] Schema evolved after %d chunks; starting segment %d",
                 source_shard,
                 n_chunks_flushed,
                 seg_idx,
             )
-        else:
-            _flush_pending()
         return schema
 
     def _flush_buffer(buf: _ShardBuffer) -> None:
-        nonlocal pending_chunk, pending_target, pending_cnt, avg_item_bytes, _sampled_avg
+        nonlocal n_chunks_flushed, avg_item_bytes, _sampled_avg
 
         if combiner_fn is not None:
-            # Combiner path: drain buffer to Python, apply combiner, re-sort in Arrow
             buf._flush_micro()
             if not buf.tables:
                 return
@@ -664,9 +638,21 @@ def _flush_buffer(buf: _ShardBuffer) -> None:
         write_schema = _ensure_writer(batch.schema)
         if batch.schema != write_schema:
             batch = batch.cast(write_schema)
-        pending_chunk = batch
-        pending_target = buf.shard_idx
-        pending_cnt = len(batch)
+
+        # Each sorted chunk is its own row group (distinct shard/chunk metadata).
+        batch_table = pa.Table.from_batches([batch])
+        spill_writer.write_row_group(batch_table)
+        seg_shard_counts[seg_idx][buf.shard_idx] = seg_shard_counts[seg_idx].get(buf.shard_idx, 0) + 1
+        n_chunks_flushed += 1
+
+        if n_chunks_flushed % 10 == 0:
+            logger.info(
+                "[shard %d segment %d] Wrote %d parquet chunks so far (latest chunk size: %d items)",
+                source_shard,
+                seg_idx,
+                n_chunks_flushed,
+                len(batch),
+            )
 
         if not _sampled_avg and len(batch) > 0:
             avg_item_bytes = batch.nbytes / len(batch)
@@ -682,16 +668,14 @@ def _flush_buffer(buf: _ShardBuffer) -> None:
             _flush_buffer(buf)
 
     with log_time(f"Flushing remaining buffers for {parquet_path}"):
-        _flush_pending()
         for target in sorted(buffers.keys()):
             buf = buffers[target]
             if buf.item_count == 0:
                 continue
             _flush_buffer(buf)
-        _flush_pending()
 
-    if writer is not None:
-        writer.close()
+    if spill_writer is not None:
+        spill_writer.close()
 
     per_shard_max_rows: dict[int, int] = {target: buf.max_rows for target, buf in buffers.items() if buf.max_rows > 0}
```