Skip to content

Commit fcc6094

Browse files
committed
[evals] Keep raw-web-markup registry helper out of default validation sets
1 parent e317bf1 commit fcc6094

3 files changed

Lines changed: 49 additions & 58 deletions

File tree

experiments/defaults.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -312,13 +312,9 @@ def default_validation_sets(tokenizer: str, base_path: str = "tokenized/") -> di
312312
@lru_cache
def default_raw_validation_sets() -> dict[str, Any]:
    """Return the default raw-text validation sets.

    Starts from a copy of ``paloma_raw_validation_sets()`` and overlays
    ``uncheatable_eval_raw_validation_sets()`` on top, so the latter's entries
    win on a key collision.

    The result is memoized by ``lru_cache``: every call returns the same dict
    object, so callers that need to modify it should copy it first.
    """
    # Imported inside the function, as in the original; NOTE(review): presumably
    # to avoid an import cycle or import-time cost — confirm.
    from experiments.evals.exp1600_uncheatable_evals import uncheatable_eval_raw_validation_sets

    merged_sets = dict(paloma_raw_validation_sets())
    merged_sets.update(uncheatable_eval_raw_validation_sets())
    return merged_sets
323319

324320

experiments/exp5056_raw_web_markup_ppl.py

Lines changed: 15 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,65 +2,26 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
"""
5-
#5056: Raw web, markup, and image-text PPL slices.
6-
7-
Parent: #5005.
8-
9-
Byte-level perplexity-gap eval slices that preserve surface syntax normally
10-
stripped by cleaned web corpora (HTML, WARC/WAT metadata, web tables, SVG XML,
11-
OCR strings, captions, alt-text, EXIF-like metadata, URL-heavy records). The
12-
goal is to surface perplexity-gap buckets that cleaned-corpus slices
13-
(Paloma / uncheatable-eval) hide.
14-
15-
Targets (tracked in #5056):
16-
- Common Crawl WARC/WAT: raw HTTP headers, raw HTML, WAT JSON.
17-
- Web Data Commons Web Tables: raw <table> HTML plus extracted JSON metadata.
18-
- SVG-Stack: SVG XML programs and captions.
19-
- TextOCR / OCR-VQA: OCR strings and scene-text annotations.
20-
- LAION metadata: URL / alt-text / EXIF-like fields — deferred pending
21-
explicit subset selection and safety-filter review.
22-
23-
Per #5056 design review, slices are oversplit by surface form (one entry per
24-
surface form, e.g. `raw_web_markup/cc_warc_html`, `raw_web_markup/cc_wat_json`)
25-
so the gap-finder bucket analysis in `marin/evaluation/perplexity_gap.py`
26-
stays clean. Grouping happens post-hoc via tags.
27-
28-
This module is intentionally a registration point: downloaders land in
29-
follow-up PRs and populate ``ACTIVE_RAW_WEB_MARKUP_DATASETS``. The aggregator
30-
``raw_web_markup_raw_validation_sets()`` is wired into
31-
``experiments/defaults.py::default_raw_validation_sets()`` so new slices flow
32-
into ``exp_model_perplexity_gap_marin_vs_llama.py`` and its siblings without
33-
touching any other file.
5+
Registry helpers for raw web, markup, and image-text perplexity-gap slices.
346
"""
357

36-
import posixpath
37-
from typing import Any
8+
import os
9+
from collections.abc import Mapping
10+
11+
from marin.evaluation.perplexity_gap import RawTextEvaluationDataset
3812

39-
# Prefix applied to every slice name so the gap-finder report groups the new
40-
# surface-preserving slices together. Top-level constant per CLAUDE.md.
4113
RAW_WEB_MARKUP_PREFIX = "raw_web_markup"
4214

43-
# Populated by follow-up PRs. Keys are slice names relative to
44-
# ``RAW_WEB_MARKUP_PREFIX`` (e.g. ``cc_warc_html``); values are
45-
# ``RawTextEvaluationDataset`` instances produced via
46-
# ``marin.evaluation.perplexity_gap.raw_text_dataset``.
47-
#
48-
# Convention: each source contributes one entry per surface form. Do not
49-
# concatenate surfaces into a single ``text`` stream — the gap-finder truncates
50-
# each doc to ``max_doc_bytes=32_768`` and reports per-slice bpb, so mixing
51-
# surfaces inside a slice loses the signal we want.
52-
ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, Any] = {}
15+
ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, RawTextEvaluationDataset] = {}
16+
5317

18+
def prefixed_raw_web_markup_validation_sets(
    datasets: Mapping[str, RawTextEvaluationDataset],
) -> dict[str, RawTextEvaluationDataset]:
    """Prefix raw-web-markup slice names with ``raw_web_markup/``.

    Args:
        datasets: Mapping from slice name (e.g. ``cc_warc_html``) to its dataset.

    Returns:
        A new dict with the same values, each keyed by
        ``RAW_WEB_MARKUP_PREFIX`` joined to the slice name with ``/``.
        ``datasets`` itself is not modified.
    """
    # Keys are logical registry names, not filesystem paths, so they must use
    # "/" on every platform. ``os.path.join`` would emit "\\" on Windows;
    # ``posixpath.join`` always joins with "/".
    import posixpath

    return {
        posixpath.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset
        for slice_name, dataset in datasets.items()
    }
5423

55-
def raw_web_markup_raw_validation_sets() -> dict[str, Any]:
56-
"""Return raw-text eval slices covering web markup and image-adjacent text.
5724

58-
Slice names are prefixed with :data:`RAW_WEB_MARKUP_PREFIX`. Returns an
59-
empty mapping until downloaders land in the follow-up PRs tracked by
60-
#5056; callers should treat an empty result as "no raw-web-markup slices
61-
are registered yet", not as an error.
62-
"""
63-
return {
64-
posixpath.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset
65-
for slice_name, dataset in ACTIVE_RAW_WEB_MARKUP_DATASETS.items()
66-
}
25+
def raw_web_markup_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
    """Return the registered raw web/markup slices keyed by ``raw_web_markup/<slice>``.

    Reads ``ACTIVE_RAW_WEB_MARKUP_DATASETS`` at call time and delegates the
    key prefixing to ``prefixed_raw_web_markup_validation_sets``.
    """
    active_slices = ACTIVE_RAW_WEB_MARKUP_DATASETS
    return prefixed_raw_web_markup_validation_sets(active_slices)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import os
5+
6+
from experiments import exp5056_raw_web_markup_ppl as raw_web_markup
7+
from marin.evaluation.perplexity_gap import RawTextEvaluationDataset
8+
9+
10+
def test_prefixed_raw_web_markup_validation_sets_prefixes_each_slice() -> None:
    """Every slice is re-keyed as ``<prefix>/<slice>`` and keeps its dataset."""
    warc = RawTextEvaluationDataset(input_path="raw/common_crawl/warc.jsonl.gz")
    wat = RawTextEvaluationDataset(input_path="raw/common_crawl/wat.jsonl.gz")

    prefixed = raw_web_markup.prefixed_raw_web_markup_validation_sets(
        {
            "cc_warc_html": warc,
            "cc_wat_json": wat,
        }
    )

    # Spell out the "/"-separated keys instead of rebuilding them with a path
    # helper, so the expectation pins the documented key format rather than
    # mirroring whatever join function the implementation uses.
    assert prefixed == {
        f"{raw_web_markup.RAW_WEB_MARKUP_PREFIX}/cc_warc_html": warc,
        f"{raw_web_markup.RAW_WEB_MARKUP_PREFIX}/cc_wat_json": wat,
    }
25+
26+
27+
def test_raw_web_markup_raw_validation_sets_reads_active_registry(monkeypatch) -> None:
    """The aggregator reflects the registry contents at call time."""
    svg = RawTextEvaluationDataset(input_path="raw/svg_stack/svg.xml.jsonl.gz")

    # Replace the module-level registry for the duration of this test only.
    monkeypatch.setattr(raw_web_markup, "ACTIVE_RAW_WEB_MARKUP_DATASETS", {"svg_xml": svg})

    # Spell out the "/"-separated key so the test pins the documented format.
    assert raw_web_markup.raw_web_markup_raw_validation_sets() == {
        f"{raw_web_markup.RAW_WEB_MARKUP_PREFIX}/svg_xml": svg
    }

0 commit comments

Comments
 (0)