Skip to content

Commit b2b4c13

Browse files
committed
zephyr: fix frozen size estimate — EMA update in write() not just flush()
The previous flush-time EMA was a closed loop: if the estimate was too low no flush fired, so the EMA never ran, and the estimate stayed low. Skewed datasets (small items early, large items later) could accumulate unbounded memory without any flush triggering. Fix: sample one item's pickle size every 10 writes and apply EMA directly in write(), independent of whether any flush has occurred. The flush-time sample (100 items first flush, 10 items ongoing) still runs for higher-quality multi-item measurements when flushes do happen. Adds a test confirming that mid-write flushes fire when large items arrive after a run of small items.
1 parent 9396701 commit b2b4c13

2 files changed

Lines changed: 74 additions & 13 deletions

File tree

lib/zephyr/src/zephyr/shuffle.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,17 @@ def get_iterators(self) -> Iterator[Iterator]:
9898
# ScatterReader. Sidecars are small msgpack files (a few KB) and reads are
9999
# GCS GET-bound, so a modest pool keeps latency low without thrashing.
100100
_SIDECAR_READ_CONCURRENCY = 32
101-
# Number of items sampled from the first flush to estimate avg_item_bytes.
101+
# Items sampled on the first flush to establish an avg_item_bytes baseline.
102102
_SCATTER_SAMPLE_SIZE = 100
103+
# Items sampled on each subsequent flush to track item-size drift cheaply.
104+
_SCATTER_ONGOING_SAMPLE_SIZE = 10
105+
# How often (in items written) to re-sample one item's pickle size and update
106+
# the EMA estimate in write(). This is independent of flush-time sampling and
107+
# ensures the estimate tracks drift even when no flush has fired yet.
108+
_ESTIMATE_WRITE_SAMPLE_INTERVAL = 10
109+
# EMA weight given to each new observation. 0.3 converges to a 2x step-change
110+
# in item size within ~3 samples while staying stable under small fluctuations.
111+
_ESTIMATE_EMA_ALPHA = 0.3
103112
# Fraction of total memory budgeted for read-side decompression buffers.
104113
_SCATTER_READ_BUFFER_FRACTION = 0.25
105114

@@ -532,12 +541,21 @@ def _flush(self, target: int, buf: list) -> None:
532541
buf = _apply_combiner(buf, self._key_fn, self._combiner_fn)
533542
buf.sort(key=self._sort_key)
534543

535-
if not self._sampled_avg and buf:
536-
sample = buf[: min(len(buf), _SCATTER_SAMPLE_SIZE)]
537-
total_bytes = sum(len(pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)) for item in sample)
538-
self._avg_item_bytes = total_bytes / len(sample)
544+
if buf:
545+
# Sample a subset of the buffer to update the byte-size estimate.
546+
# First flush: larger sample for a good baseline. Subsequent flushes:
547+
# smaller sample to track drift cheaply via EMA. This prevents OOM
548+
# when early items are small but later items are large — the estimate
549+
# stays current rather than being frozen at the first-flush value.
550+
n = _SCATTER_SAMPLE_SIZE if not self._sampled_avg else _SCATTER_ONGOING_SAMPLE_SIZE
551+
sample = buf[: min(len(buf), n)]
552+
observed = sum(len(pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)) for item in sample) / len(sample)
553+
if not self._sampled_avg:
554+
self._avg_item_bytes = observed
555+
self._sampled_avg = True
556+
else:
557+
self._avg_item_bytes = (1 - _ESTIMATE_EMA_ALPHA) * self._avg_item_bytes + _ESTIMATE_EMA_ALPHA * observed
539558
self._item_bytes_estimate = self._avg_item_bytes
540-
self._sampled_avg = True
541559

542560
frame = _write_chunk_frame(buf)
543561
offset = self._out.tell()
@@ -557,13 +575,21 @@ def _flush(self, target: int, buf: list) -> None:
557575

558576
def write(self, item: Any) -> None:
559577
"""Route a single item to its target shard buffer, flushing when over budget."""
560-
if self._total_buffer_rows == 0:
561-
# Calibrate from the first item before any batching occurs. A
562-
# hardcoded default (e.g. 512 B) can be orders of magnitude off for
563-
# large documents, allowing millions of rows to accumulate before the
564-
# first flush fires. One real measurement is far safer.
565-
self._item_bytes_estimate = float(len(pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)))
566-
self._first_item_bytes = self._item_bytes_estimate
578+
if self._total_buffer_rows % _ESTIMATE_WRITE_SAMPLE_INTERVAL == 0:
579+
# Periodically measure a single item's serialised size and apply EMA.
580+
# This runs in write() — not just in _flush() — so the estimate tracks
581+
# size drift even when no flush has fired yet (the flush EMA is a
582+
# closed loop: if the estimate is too low no flush fires, so it never
583+
# updates). Interval-based sampling amortises the pickle.dumps cost
584+
# to 1-in-10 items while still catching step-changes within a few rows.
585+
observed = float(len(pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)))
586+
if self._total_buffer_rows == 0:
587+
self._item_bytes_estimate = observed
588+
self._first_item_bytes = observed
589+
else:
590+
self._item_bytes_estimate = (
591+
1 - _ESTIMATE_EMA_ALPHA
592+
) * self._item_bytes_estimate + _ESTIMATE_EMA_ALPHA * observed
567593

568594
key = self._key_fn(item)
569595
target = deterministic_hash(key) % self._num_output_shards

lib/zephyr/tests/test_shuffle.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,41 @@ def test_scatter_byte_budget_flushes_mid_write(tmp_path):
200200
assert total_chunks > 2, f"expected >2 chunks with 1-byte budget, got {total_chunks}"
201201

202202

203+
def test_scatter_estimate_tracks_skewed_items(tmp_path):
204+
"""Estimate updates after each flush so large late items still trigger budget flushes."""
205+
num_shards = 1
206+
data_path = str(tmp_path / "shard-0000.shuffle")
207+
208+
# Start with tiny items, then switch to large items. With a frozen estimate
209+
# the budget check would never fire for the large items. With EMA updates it
210+
# should: _item_bytes_estimate rises and eventually exceeds budget / rows.
211+
small_items = [{"k": 0, "v": "x"} for _ in range(50)]
212+
large_items = [{"k": 0, "v": "y" * 50_000} for _ in range(10)]
213+
214+
# Budget large enough that small items alone never flush, but one large
215+
# item should push the estimate over threshold quickly.
216+
budget = 10_000 # 10 KB — well under 10 * 50 KB large items
217+
writer = ScatterWriter(
218+
data_path=data_path,
219+
key_fn=_key,
220+
num_output_shards=num_shards,
221+
buffer_limit_bytes=budget,
222+
)
223+
for item in small_items + large_items:
224+
writer.write(item)
225+
writer.close()
226+
227+
# All items must survive the skewed flush pattern.
228+
scatter_paths = [data_path]
229+
recovered = list(ScatterReader.from_sidecars(scatter_paths, 0))
230+
all_items = small_items + large_items
231+
assert sorted(recovered, key=lambda x: x["v"]) == sorted(all_items, key=lambda x: x["v"])
232+
233+
# The estimate must have been updated: mid-write flushes should have fired
234+
# for the large items (not just at close).
235+
assert writer._mid_write_flushes > 0, "expected mid-write flushes for large items"
236+
237+
203238
def test_scatter_byte_budget_preserves_all_items(tmp_path):
204239
"""Items are not lost or duplicated when byte-budget flushes fire mid-write."""
205240
num_shards = 3

0 commit comments

Comments (0)