[evals] Keep raw-web-markup registry helper out of default validation sets

dlwh · dlwh · commit fcc6094ed947 · 2026-04-22T15:08:40.000-07:00
diff --git a/experiments/defaults.py b/experiments/defaults.py
@@ -312,13 +312,9 @@ def default_validation_sets(tokenizer: str, base_path: str = "tokenized/") -> di
 @lru_cache
 def default_raw_validation_sets() -> dict[str, Any]:
     from experiments.evals.exp1600_uncheatable_evals import uncheatable_eval_raw_validation_sets
-    from experiments.exp5056_raw_web_markup_ppl import raw_web_markup_raw_validation_sets
 
     validation_sets = dict(paloma_raw_validation_sets())
     validation_sets.update(uncheatable_eval_raw_validation_sets())
-    # Surface-preserving web/markup/image-text slices (#5056, parent #5005).
-    # Empty until downloaders land; see experiments/exp5056_raw_web_markup_ppl.py.
-    validation_sets.update(raw_web_markup_raw_validation_sets())
     return validation_sets
 
 
diff --git a/experiments/exp5056_raw_web_markup_ppl.py b/experiments/exp5056_raw_web_markup_ppl.py
@@ -2,65 +2,26 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-#5056: Raw web, markup, and image-text PPL slices.
-
-Parent: #5005.
-
-Byte-level perplexity-gap eval slices that preserve surface syntax normally
-stripped by cleaned web corpora (HTML, WARC/WAT metadata, web tables, SVG XML,
-OCR strings, captions, alt-text, EXIF-like metadata, URL-heavy records). The
-goal is to surface perplexity-gap buckets that cleaned-corpus slices
-(Paloma / uncheatable-eval) hide.
-
-Targets (tracked in #5056):
-  - Common Crawl WARC/WAT: raw HTTP headers, raw HTML, WAT JSON.
-  - Web Data Commons Web Tables: raw <table> HTML plus extracted JSON metadata.
-  - SVG-Stack: SVG XML programs and captions.
-  - TextOCR / OCR-VQA: OCR strings and scene-text annotations.
-  - LAION metadata: URL / alt-text / EXIF-like fields — deferred pending
-    explicit subset selection and safety-filter review.
-
-Per #5056 design review, slices are oversplit by surface form (one entry per
-surface form, e.g. `raw_web_markup/cc_warc_html`, `raw_web_markup/cc_wat_json`)
-so the gap-finder bucket analysis in `marin/evaluation/perplexity_gap.py`
-stays clean. Grouping happens post-hoc via tags.
-
-This module is intentionally a registration point: downloaders land in
-follow-up PRs and populate ``ACTIVE_RAW_WEB_MARKUP_DATASETS``. The aggregator
-``raw_web_markup_raw_validation_sets()`` is wired into
-``experiments/defaults.py::default_raw_validation_sets()`` so new slices flow
-into ``exp_model_perplexity_gap_marin_vs_llama.py`` and its siblings without
-touching any other file.
+Registry helpers for raw web, markup, and image-text perplexity-gap slices.
 """
 
-import posixpath
-from typing import Any
+import os
+from collections.abc import Mapping
+
+from marin.evaluation.perplexity_gap import RawTextEvaluationDataset
 
-# Prefix applied to every slice name so the gap-finder report groups the new
-# surface-preserving slices together. Top-level constant per CLAUDE.md.
 RAW_WEB_MARKUP_PREFIX = "raw_web_markup"
 
-# Populated by follow-up PRs. Keys are slice names relative to
-# ``RAW_WEB_MARKUP_PREFIX`` (e.g. ``cc_warc_html``); values are
-# ``RawTextEvaluationDataset`` instances produced via
-# ``marin.evaluation.perplexity_gap.raw_text_dataset``.
-#
-# Convention: each source contributes one entry per surface form. Do not
-# concatenate surfaces into a single ``text`` stream — the gap-finder truncates
-# each doc to ``max_doc_bytes=32_768`` and reports per-slice bpb, so mixing
-# surfaces inside a slice loses the signal we want.
-ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, Any] = {}
+ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, RawTextEvaluationDataset] = {}  # unprefixed slice name -> dataset
+
 
+def prefixed_raw_web_markup_validation_sets(
+    datasets: Mapping[str, RawTextEvaluationDataset],
+) -> dict[str, RawTextEvaluationDataset]:
+    """Re-key ``datasets`` as ``raw_web_markup/<slice>``, always "/"-separated (keys are names, not OS paths)."""
+    return {f"{RAW_WEB_MARKUP_PREFIX}/{slice_name}": dataset for slice_name, dataset in datasets.items()}
 
-def raw_web_markup_raw_validation_sets() -> dict[str, Any]:
-    """Return raw-text eval slices covering web markup and image-adjacent text.
 
-    Slice names are prefixed with :data:`RAW_WEB_MARKUP_PREFIX`. Returns an
-    empty mapping until downloaders land in the follow-up PRs tracked by
-    #5056; callers should treat an empty result as "no raw-web-markup slices
-    are registered yet", not as an error.
-    """
-    return {
-        posixpath.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset
-        for slice_name, dataset in ACTIVE_RAW_WEB_MARKUP_DATASETS.items()
-    }
+def raw_web_markup_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
+    """Return the ``ACTIVE_RAW_WEB_MARKUP_DATASETS`` registry re-keyed as ``raw_web_markup/<slice>``."""
+    return prefixed_raw_web_markup_validation_sets(ACTIVE_RAW_WEB_MARKUP_DATASETS)
diff --git a/tests/evals/test_exp5056_raw_web_markup_ppl.py b/tests/evals/test_exp5056_raw_web_markup_ppl.py
@@ -0,0 +1,34 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+from experiments import exp5056_raw_web_markup_ppl as raw_web_markup
+from marin.evaluation.perplexity_gap import RawTextEvaluationDataset
+
+
+def test_prefixed_raw_web_markup_validation_sets_prefixes_each_slice() -> None:
+    warc = RawTextEvaluationDataset(input_path="raw/common_crawl/warc.jsonl.gz")
+    wat = RawTextEvaluationDataset(input_path="raw/common_crawl/wat.jsonl.gz")
+
+    prefixed = raw_web_markup.prefixed_raw_web_markup_validation_sets(
+        {
+            "cc_warc_html": warc,
+            "cc_wat_json": wat,
+        }
+    )
+
+    assert prefixed == {  # NOTE(review): keys built via os.path.join, so "/" is only pinned on POSIX hosts
+        os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "cc_warc_html"): warc,
+        os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "cc_wat_json"): wat,
+    }
+
+
+def test_raw_web_markup_raw_validation_sets_reads_active_registry(monkeypatch) -> None:
+    svg = RawTextEvaluationDataset(input_path="raw/svg_stack/svg.xml.jsonl.gz")
+
+    monkeypatch.setattr(raw_web_markup, "ACTIVE_RAW_WEB_MARKUP_DATASETS", {"svg_xml": svg})
+
+    assert raw_web_markup.raw_web_markup_raw_validation_sets() == {  # registry global is looked up at call time
+        os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_xml"): svg
+    }