Skip to content

Commit 23ae265

Browse files
ravwojdyla and claude authored
tokenize: size window + levanter batch from parquet row groups (#5158)
* size zephyr window and levanter cache `batch_size` from parquet row-group metadata so each unit of work aligns with ~half a row group end-to-end * probe first parquet file's footer via `_avg_parquet_row_group_rows`, then set `window = min(avg_rows_per_rg // 2, 64)` and `batch_size = avg_rows_per_rg // 2` * halving gives zephyr headroom to pipeline two windows per row group and caps per-worker peak memory * non-parquet inputs keep the old defaults (`window=64`, `batch_size` from `config.levanter_batch_size`) * caller-supplied `config.levanter_batch_size` still wins over the row-group-derived default * extract `_MAX_WINDOW_SIZE = 64` constant [^1] CC: @rjpower [^1]: rationale for the 64 cap lives in #2829 (comment) Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 30f6b6c commit 23ae265

1 file changed

Lines changed: 42 additions & 4 deletions

File tree

lib/marin/src/marin/processing/tokenize/tokenize.py

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import braceexpand
2020
import draccus
2121
import fsspec
22+
import pyarrow.parquet as pq
2223
from rigging.filesystem import open_url, url_to_fs
2324
from datasets import load_dataset_builder
2425
from fray import ResourceConfig
@@ -44,6 +45,22 @@
4445
logger = logging.getLogger(__name__)
4546

4647
MIN_GROUP_BYTES = 100_000_000 # 100 MB floor to avoid degenerate tiny shards
48+
# Empirical upper bound on the zephyr window size (see
49+
# https://github.com/marin-community/marin/issues/2829#issuecomment-3963661943).
50+
_MAX_WINDOW_SIZE = 64
51+
52+
53+
def _avg_parquet_row_group_rows(path: str) -> int | None:
    """Probe the parquet footer of ``path`` and return its mean row-group size.

    Only the file metadata (footer) is read, so this is cheap even for large
    files. Returns ``None`` when the footer reports zero row groups (an empty
    parquet file); otherwise the result is clamped to at least 1.
    """
    filesystem, local_path = url_to_fs(path)
    with filesystem.open(local_path, "rb") as handle:
        footer = pq.ParquetFile(handle).metadata
        if footer.num_row_groups == 0:
            return None
        # Integer mean; max() guards against files with fewer rows than groups.
        return max(1, footer.num_rows // footer.num_row_groups)
4764

4865

4966
def _compute_target_group_bytes(total_input_bytes: int, max_workers: int) -> int:
@@ -396,22 +413,43 @@ def run_pipeline(ctx: ZephyrContext, file_groups: list[list[str]], split_name: s
396413
prefix = os.path.join(config.cache_path, split_name)
397414
pipeline_start = time.monotonic()
398415

416+
# For parquet sources, align zephyr's window and levanter's cache batch
417+
# with the parquet row-group size so each unit of work covers roughly half
418+
# a row group end-to-end. Non-parquet inputs fall through to the defaults.
419+
sample_path = next(
420+
(p for group in file_groups for p in group if p.endswith(".parquet")),
421+
None,
422+
)
423+
window_size = _MAX_WINDOW_SIZE
424+
batch_size = config.levanter_batch_size
425+
if sample_path is not None:
426+
avg_rg_rows = _avg_parquet_row_group_rows(sample_path)
427+
if avg_rg_rows is not None:
428+
half_rg = max(avg_rg_rows // 2, 1)
429+
window_size = min(half_rg, _MAX_WINDOW_SIZE)
430+
batch_size = half_rg if config.levanter_batch_size is None else config.levanter_batch_size
431+
logger.info(
432+
"Parquet source: avg rows/row-group=%d (from %s) → window=%d, levanter batch_size=%d",
433+
avg_rg_rows,
434+
sample_path,
435+
window_size,
436+
batch_size,
437+
)
438+
399439
ds = Dataset.from_list(file_groups).flat_map(lambda file_list: file_list).flat_map(load_file)
400440

401441
if config.sample_count is not None:
402442
logger.info(f"Sampling {config.sample_count} examples from {split_name} set for tokenization")
403443
ds = ds.take_per_shard(config.sample_count)
404444

405445
temp_shards = (
406-
# NOTE: https://github.com/marin-community/marin/issues/2829#issuecomment-3963661943
407-
# Window set to 64 ^
408-
ds.window(64)
446+
ds.window(window_size)
409447
.map_shard(lambda batches, _: _tokenize_batches(config=config, batches=batches))
410448
.write_levanter_cache(
411449
f"{prefix}/part-{{shard:05d}}-of-{{total:05d}}",
412450
metadata={},
413451
skip_existing=True,
414-
batch_size=config.levanter_batch_size,
452+
batch_size=batch_size,
415453
)
416454
)
417455

0 commit comments

Comments (0)