Skip to content

Commit 5bae69b

Browse files
[marin] datakit/normalize: compact pathological whitespace runs (#4603)
## Summary - Add a `max_whitespace_run_chars` option (default 128) to the datakit normalization step. Consecutive whitespace runs exceeding the limit are truncated to that length — **preserving the surrounding content** rather than dropping the entire document. - Handles broken HTML→text extraction artifacts (e.g. multi-MB space runs, cf. #4588) that can OOM downstream tokenization, while keeping the actual useful text. - Affected records are counted via a new Zephyr counter, `datakit_normalize_compacted_whitespace`. Document `id` is recomputed after compaction. - Follow-up to #4600 (which caps homogeneous runs inside the tokenizer). Pass `max_whitespace_run_chars=None` to disable. ## Test plan - [x] `tests/datakit/test_normalize.py` — new cases: verifies compaction preserves content and recomputes id; `None` disables compaction. Existing 10 tests still pass. - [x] `./infra/pre-commit.py --fix` clean. Co-authored-by: Rafal Wojdyla <ravwojdyla@gmail.com>
1 parent ba15a6d commit 5bae69b

File tree

2 files changed

+83
-1
lines changed

2 files changed

+83
-1
lines changed

lib/marin/src/marin/datakit/normalize.py

Lines changed: 54 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616

1717
import logging
1818
import os
19+
import re
1920
from collections.abc import Callable, Iterator
2021
from concurrent.futures import ThreadPoolExecutor, as_completed
2122
from dataclasses import dataclass, field
@@ -32,6 +33,17 @@
3233

3334
logger = logging.getLogger(__name__)
3435

36+
# Default cap on the longest consecutive whitespace run in a document.
37+
# Runs exceeding this are compacted to this length at normalization time.
38+
# Pathologically long whitespace runs (e.g. multi-MB runs from broken
39+
# HTML→text extraction, cf. #4588) can OOM downstream tokenization.
40+
# 128 matches the longest whitespace run that Llama's tokenizer collapses
41+
# into a single token, so capping here is lossless for that tokenizer.
42+
DEFAULT_MAX_WHITESPACE_RUN_CHARS = 128
43+
44+
# Counter name for documents that had whitespace runs compacted.
45+
COMPACTED_WHITESPACE_COUNTER = "datakit_normalize_compacted_whitespace"
46+
3547

3648
class DedupMode(StrEnum):
3749
"""How aggressively to deduplicate records during normalization.
@@ -183,13 +195,35 @@ def _compute_total_bytes(file_paths: list[str]) -> int:
183195
return total
184196

185197

198+
def _make_whitespace_compactor(max_whitespace_run_chars: int) -> Callable[[dict[str, Any]], dict[str, Any]]:
199+
"""Return a map function that compacts consecutive whitespace runs exceeding the limit.
200+
201+
Any run of whitespace longer than *max_whitespace_run_chars* is truncated to
202+
that length (preserving the original whitespace characters). Affected records
203+
are counted via the ``COMPACTED_WHITESPACE_COUNTER`` Zephyr counter, and the
204+
``id`` is recomputed to reflect the new text.
205+
"""
206+
pattern = re.compile(r"\s{" + str(max_whitespace_run_chars + 1) + r",}")
207+
208+
def compact(record: dict[str, Any]) -> dict[str, Any]:
209+
text = record["text"]
210+
compacted = pattern.sub(lambda m: m.group(0)[:max_whitespace_run_chars], text)
211+
if len(compacted) != len(text):
212+
counters.increment(COMPACTED_WHITESPACE_COUNTER)
213+
record = {**record, "text": compacted, "id": generate_id(compacted)}
214+
return record
215+
216+
return compact
217+
218+
186219
def _build_pipeline(
187220
files: list[str],
188221
output_dir: str,
189222
num_shards: int,
190223
text_field: str,
191224
id_field: str | None,
192225
dedup_mode: DedupMode,
226+
max_whitespace_run_chars: int,
193227
) -> Dataset:
194228
"""Build a single Zephyr pipeline for one subdirectory."""
195229
normalize_record = _make_normalize_fn(text_field, id_field)
@@ -221,6 +255,7 @@ def has_text(record: dict[str, Any]) -> bool:
221255
.flat_map(load_file)
222256
.filter(has_text)
223257
.map(normalize_record)
258+
.map(_make_whitespace_compactor(max_whitespace_run_chars))
224259
.group_by(
225260
key=lambda r: int(r["id"], 16) % num_shards,
226261
reducer=reducers[dedup_mode],
@@ -241,6 +276,7 @@ def normalize_to_parquet(
241276
text_field: str = "text",
242277
id_field: str = "id",
243278
target_partition_bytes: int = 256 * 1024 * 1024,
279+
max_whitespace_run_chars: int = DEFAULT_MAX_WHITESPACE_RUN_CHARS,
244280
worker_resources: ResourceConfig | None = None,
245281
file_extensions: tuple[str, ...] | None = None,
246282
dedup_mode: DedupMode = DedupMode.EXACT,
@@ -262,6 +298,12 @@ def normalize_to_parquet(
262298
silently skipped.
263299
target_partition_bytes: Target size in bytes per output partition.
264300
Used to compute the number of output shards per subdirectory.
301+
max_whitespace_run_chars: Compact any consecutive whitespace run
302+
longer than this many characters down to this length.
303+
Pathologically long whitespace runs (e.g. multi-MB runs from
304+
broken HTML→text extraction, cf. #4588) can OOM downstream
305+
tokenization. Affected records are counted via the
306+
``datakit_normalize_compacted_whitespace`` Zephyr counter.
265307
worker_resources: Per-worker resource request for the Zephyr pipeline.
266308
Defaults to 2 CPU / 16GB RAM / 10GB disk, sized for
267309
``target_partition_bytes`` of 256MB. Scale up when increasing
@@ -300,7 +342,15 @@ def _run_subdir(subdir: str, files: list[str]) -> NormalizeSubdirResult:
300342
num_shards,
301343
)
302344

303-
pipeline = _build_pipeline(files, output_dir, num_shards, text_field, id_field, dedup_mode)
345+
pipeline = _build_pipeline(
346+
files,
347+
output_dir,
348+
num_shards,
349+
text_field,
350+
id_field,
351+
dedup_mode,
352+
max_whitespace_run_chars,
353+
)
304354
ctx = ZephyrContext(
305355
name=f"normalize-{subdir.replace('/', '-') if subdir else 'all'}",
306356
resources=resources,
@@ -343,6 +393,7 @@ def normalize_step(
343393
text_field: str = "text",
344394
id_field: str = "id",
345395
target_partition_bytes: int = 256 * 1024 * 1024,
396+
max_whitespace_run_chars: int = DEFAULT_MAX_WHITESPACE_RUN_CHARS,
346397
worker_resources: ResourceConfig | None = None,
347398
override_output_path: str | None = None,
348399
input_path: str | None = None,
@@ -378,6 +429,7 @@ def normalize_step(
378429
text_field=text_field,
379430
id_field=id_field,
380431
target_partition_bytes=target_partition_bytes,
432+
max_whitespace_run_chars=max_whitespace_run_chars,
381433
worker_resources=worker_resources,
382434
file_extensions=file_extensions,
383435
dedup_mode=dedup_mode,
@@ -387,6 +439,7 @@ def normalize_step(
387439
"text_field": text_field,
388440
"id_field": id_field,
389441
"target_partition_bytes": target_partition_bytes,
442+
"max_whitespace_run_chars": max_whitespace_run_chars,
390443
"input_path": resolved_input,
391444
"file_extensions": file_extensions,
392445
"dedup_mode": dedup_mode,

tests/datakit/test_normalize.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -203,6 +203,35 @@ def test_skip_existing_idempotent(tmp_path: Path, write_jsonl_gz):
203203
assert parquet_files[0].stat().st_mtime == mtime_first
204204

205205

206+
def test_whitespace_compaction(tmp_path: Path, write_jsonl_gz):
207+
"""Long whitespace runs are compacted, not dropped. Content is preserved."""
208+
input_dir = tmp_path / "input"
209+
output_dir = tmp_path / "output"
210+
211+
records = [
212+
{"id": "normal", "text": "Hello world"},
213+
{"id": "pathological", "text": "before" + " " * 500 + "after"},
214+
{"id": "also_normal", "text": "short spaces are fine"},
215+
]
216+
write_jsonl_gz(input_dir / "data.jsonl.gz", records)
217+
218+
normalize_to_parquet(
219+
input_path=str(input_dir),
220+
output_path=str(output_dir),
221+
max_whitespace_run_chars=100,
222+
)
223+
224+
results = _read_all_parquet(output_dir)
225+
# All three records survive — the pathological one is compacted, not dropped
226+
assert len(results) == 3
227+
by_source = {r["source_id"]: r for r in results}
228+
assert by_source["pathological"]["text"] == "before" + " " * 100 + "after"
229+
# id is recomputed from the compacted text
230+
assert by_source["pathological"]["id"] == generate_id("before" + " " * 100 + "after")
231+
# Normal docs are untouched
232+
assert by_source["normal"]["text"] == "Hello world"
233+
234+
206235
def test_no_input_files_raises(tmp_path: Path):
207236
input_dir = tmp_path / "input"
208237
input_dir.mkdir()

0 commit comments

Comments (0)