|
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
4 | 4 | """ |
5 | | -#5056: Raw web, markup, and image-text PPL slices. |
6 | | -
|
7 | | -Parent: #5005. |
8 | | -
|
9 | | -Byte-level perplexity-gap eval slices that preserve surface syntax normally |
10 | | -stripped by cleaned web corpora (HTML, WARC/WAT metadata, web tables, SVG XML, |
11 | | -OCR strings, captions, alt-text, EXIF-like metadata, URL-heavy records). The |
12 | | -goal is to surface perplexity-gap buckets that cleaned-corpus slices |
13 | | -(Paloma / uncheatable-eval) hide. |
14 | | -
|
15 | | -Targets (tracked in #5056): |
16 | | - - Common Crawl WARC/WAT: raw HTTP headers, raw HTML, WAT JSON. |
17 | | - - Web Data Commons Web Tables: raw <table> HTML plus extracted JSON metadata. |
18 | | - - SVG-Stack: SVG XML programs and captions. |
19 | | - - TextOCR / OCR-VQA: OCR strings and scene-text annotations. |
20 | | - - LAION metadata: URL / alt-text / EXIF-like fields — deferred pending |
21 | | - explicit subset selection and safety-filter review. |
22 | | -
|
23 | | -Per #5056 design review, slices are oversplit by surface form (one entry per |
24 | | -surface form, e.g. `raw_web_markup/cc_warc_html`, `raw_web_markup/cc_wat_json`) |
25 | | -so the gap-finder bucket analysis in `marin/evaluation/perplexity_gap.py` |
26 | | -stays clean. Grouping happens post-hoc via tags. |
27 | | -
|
28 | | -This module is intentionally a registration point: downloaders land in |
29 | | -follow-up PRs and populate ``ACTIVE_RAW_WEB_MARKUP_DATASETS``. The aggregator |
30 | | -``raw_web_markup_raw_validation_sets()`` is wired into |
31 | | -``experiments/defaults.py::default_raw_validation_sets()`` so new slices flow |
32 | | -into ``exp_model_perplexity_gap_marin_vs_llama.py`` and its siblings without |
33 | | -touching any other file. |
| 5 | +Registry helpers for raw web, markup, and image-text perplexity-gap slices. |
34 | 6 | """ |
35 | 7 |
|
36 | | -import posixpath |
37 | | -from typing import Any |
| 8 | +import os |
| 9 | +from collections.abc import Mapping |
| 10 | + |
| 11 | +from marin.evaluation.perplexity_gap import RawTextEvaluationDataset |
38 | 12 |
|
39 | | -# Prefix applied to every slice name so the gap-finder report groups the new |
40 | | -# surface-preserving slices together. Top-level constant per CLAUDE.md. |
41 | 13 | RAW_WEB_MARKUP_PREFIX = "raw_web_markup" |
42 | 14 |
|
43 | | -# Populated by follow-up PRs. Keys are slice names relative to |
44 | | -# ``RAW_WEB_MARKUP_PREFIX`` (e.g. ``cc_warc_html``); values are |
45 | | -# ``RawTextEvaluationDataset`` instances produced via |
46 | | -# ``marin.evaluation.perplexity_gap.raw_text_dataset``. |
47 | | -# |
48 | | -# Convention: each source contributes one entry per surface form. Do not |
49 | | -# concatenate surfaces into a single ``text`` stream — the gap-finder truncates |
50 | | -# each doc to ``max_doc_bytes=32_768`` and reports per-slice bpb, so mixing |
51 | | -# surfaces inside a slice loses the signal we want. |
52 | | -ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, Any] = {} |
| 15 | +ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, RawTextEvaluationDataset] = {} |
| 16 | + |
53 | 17 |
|
def prefixed_raw_web_markup_validation_sets(
    datasets: Mapping[str, RawTextEvaluationDataset],
) -> dict[str, RawTextEvaluationDataset]:
    """Prefix raw-web-markup slice names with ``raw_web_markup/``.

    Args:
        datasets: Mapping from a bare slice name to its evaluation dataset.

    Returns:
        A new dict holding the same dataset objects, keyed by
        ``RAW_WEB_MARKUP_PREFIX`` joined to each slice name with a forward
        slash. The input mapping is not modified.
    """
    # Slice names are logical report keys, not filesystem paths. ``os.path.join``
    # uses the host separator and would yield ``raw_web_markup\<slice>`` on
    # Windows, so join with POSIX rules to keep the keys platform-independent.
    import posixpath

    return {
        posixpath.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset
        for slice_name, dataset in datasets.items()
    }
54 | 23 |
|
55 | | -def raw_web_markup_raw_validation_sets() -> dict[str, Any]: |
56 | | - """Return raw-text eval slices covering web markup and image-adjacent text. |
57 | 24 |
|
58 | | - Slice names are prefixed with :data:`RAW_WEB_MARKUP_PREFIX`. Returns an |
59 | | - empty mapping until downloaders land in the follow-up PRs tracked by |
60 | | - #5056; callers should treat an empty result as "no raw-web-markup slices |
61 | | - are registered yet", not as an error. |
62 | | - """ |
63 | | - return { |
64 | | - posixpath.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset |
65 | | - for slice_name, dataset in ACTIVE_RAW_WEB_MARKUP_DATASETS.items() |
66 | | - } |
def raw_web_markup_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
    """Return the registered raw web/markup slices keyed by ``raw_web_markup/<slice>``.

    Reads the module-level ``ACTIVE_RAW_WEB_MARKUP_DATASETS`` registry at call
    time and passes it through ``prefixed_raw_web_markup_validation_sets``.

    Returns:
        The mapping produced by ``prefixed_raw_web_markup_validation_sets``
        for the current registry contents.
    """
    registered_slices = ACTIVE_RAW_WEB_MARKUP_DATASETS
    prefixed_slices = prefixed_raw_web_markup_validation_sets(registered_slices)
    return prefixed_slices
0 commit comments