-
Notifications
You must be signed in to change notification settings - Fork 111
[evals] Add SVG-backed raw web markup PPL slices #5130
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,70 @@ | ||
| # Copyright The Marin Authors | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| """ | ||
| Registry helpers for raw web, markup, and image-text perplexity-gap slices. | ||
|
|
||
| This module stays opt-in: call ``raw_web_markup_raw_validation_sets()`` explicitly | ||
| from a pilot gap experiment instead of extending ``default_raw_validation_sets()``. | ||
| The first non-empty slices use ``starvector/svg-stack`` directly from Hugging Face | ||
| so we can preserve exact SVG XML without adding a downloader. | ||
| """ | ||
|
|
||
| import os | ||
| from collections.abc import Mapping | ||
|
|
||
| from marin.evaluation.perplexity_gap import RawTextEvaluationDataset, raw_text_dataset | ||
| from marin.processing.tokenize import HfDatasetSpec | ||
|
|
||
# Key prefix applied to every slice name by ``prefixed_raw_web_markup_validation_sets``;
# also attached to each dataset as its first tag.
RAW_WEB_MARKUP_PREFIX = "raw_web_markup"
# Tag attached to every dataset built by ``_hf_raw_web_markup_dataset``.
RAW_WEB_MARKUP_ISSUE_TAG = "issue:5056"
# Hugging Face dataset spec backing the SVG slices.
SVG_STACK_DATASET = HfDatasetSpec(id="starvector/svg-stack")
# Source and surface tags attached to the SVG slices.
SVG_STACK_SOURCE_TAG = "source:svg_stack"
SVG_XML_SURFACE_TAG = "surface:svg_xml"
# Value passed as ``text_key`` for the SVG slices.
SVG_TEXT_KEY = "Svg"
|
|
||
|
|
||
def _hf_raw_web_markup_dataset(
    hf_dataset: HfDatasetSpec,
    *,
    text_key: str,
    split: str,
    source_tag: str,
    surface_tag: str,
) -> RawTextEvaluationDataset:
    """Build a tagged raw-text evaluation dataset from a Hugging Face dataset spec.

    Args:
        hf_dataset: Dataset spec forwarded to ``raw_text_dataset``.
        text_key: Forwarded to ``raw_text_dataset`` as ``text_key``.
        split: Forwarded to ``raw_text_dataset`` as ``split``; also recorded
            as a ``split:<split>`` tag.
        source_tag: Tag naming the data source.
        surface_tag: Tag naming the text surface.

    Returns:
        The dataset returned by ``raw_text_dataset``, tagged with the registry
        prefix, the issue tag, ``source_tag``, ``surface_tag`` and the split tag,
        in that order.
    """
    split_tag = f"split:{split}"
    slice_tags = (
        RAW_WEB_MARKUP_PREFIX,
        RAW_WEB_MARKUP_ISSUE_TAG,
        source_tag,
        surface_tag,
        split_tag,
    )
    return raw_text_dataset(hf_dataset, text_key=text_key, split=split, tags=slice_tags)
|
|
||
|
|
||
# Registry of the currently active slices, keyed by ``<source>/<slice>``.
# The keys are logical names rather than filesystem paths, so the ``/`` separator
# is written literally instead of going through ``os.path.join``; the resulting
# strings are the same as before on POSIX and no longer vary by platform.
# Both entries differ only in the split, so they are generated from one template.
ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, RawTextEvaluationDataset] = {
    f"svg_stack/svg_xml_{split}": _hf_raw_web_markup_dataset(
        SVG_STACK_DATASET,
        text_key=SVG_TEXT_KEY,
        split=split,
        source_tag=SVG_STACK_SOURCE_TAG,
        surface_tag=SVG_XML_SURFACE_TAG,
    )
    for split in ("val", "test")
}
|
|
||
|
|
||
def prefixed_raw_web_markup_validation_sets(
    datasets: Mapping[str, RawTextEvaluationDataset],
) -> dict[str, RawTextEvaluationDataset]:
    """Prefix raw-web-markup slice names with ``raw_web_markup/``.

    Args:
        datasets: Mapping from slice name to evaluation dataset.

    Returns:
        A new dict with the same dataset objects, each keyed by its slice name
        joined onto ``RAW_WEB_MARKUP_PREFIX`` with a forward slash. The input
        mapping is not modified.
    """
    # Keys are logical slice names, not filesystem paths: join with POSIX rules so
    # the separator is always "/" as documented, rather than following the host OS.
    # Imported locally so this function does not depend on a new module-level import.
    import posixpath

    return {posixpath.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset for slice_name, dataset in datasets.items()}
|
|
||
|
|
||
def raw_web_markup_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
    """Return the active raw web/markup slices keyed by ``raw_web_markup/<slice>``.

    The registry is read from the module-level ``ACTIVE_RAW_WEB_MARKUP_DATASETS``
    at call time and passed through ``prefixed_raw_web_markup_validation_sets``.
    """
    active_slices = ACTIVE_RAW_WEB_MARKUP_DATASETS
    return prefixed_raw_web_markup_validation_sets(active_slices)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| # Copyright The Marin Authors | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import os | ||
|
|
||
| import pytest | ||
|
|
||
| from experiments import exp5056_raw_web_markup_ppl as raw_web_markup | ||
| from levanter.data.text import HfDatasetSourceConfig | ||
| from marin.evaluation.perplexity_gap import ( | ||
| RawTextEvaluationDataset, | ||
| _to_dataset_component, | ||
| raw_text_dataset, | ||
| ) | ||
|
|
||
|
|
||
def test_prefixed_raw_web_markup_validation_sets_prefixes_each_slice() -> None:
    """Each slice name passed in comes back keyed under the registry prefix."""
    slices = {
        "cc_warc_html": RawTextEvaluationDataset(input_path="raw/common_crawl/warc.jsonl.gz"),
        "cc_wat_json": RawTextEvaluationDataset(input_path="raw/common_crawl/wat.jsonl.gz"),
    }

    expected = {
        os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, name): dataset for name, dataset in slices.items()
    }

    assert raw_web_markup.prefixed_raw_web_markup_validation_sets(slices) == expected
|
|
||
|
|
||
def test_raw_web_markup_raw_validation_sets_reads_active_registry(monkeypatch) -> None:
    """The public accessor reflects whatever the module-level registry holds when called."""
    stub_slice = RawTextEvaluationDataset(input_path="raw/svg_stack/svg.xml.jsonl.gz")
    monkeypatch.setattr(raw_web_markup, "ACTIVE_RAW_WEB_MARKUP_DATASETS", {"svg_xml": stub_slice})

    expected_key = os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_xml")
    observed = raw_web_markup.raw_web_markup_raw_validation_sets()

    assert observed == {expected_key: stub_slice}
|
|
||
|
|
||
def test_raw_web_markup_raw_validation_sets_registers_svg_stack_hf_slices() -> None:
    """The default registry exposes exactly the two svg-stack slices; the val slice is checked field by field."""
    registry = raw_web_markup.raw_web_markup_raw_validation_sets()
    prefix = raw_web_markup.RAW_WEB_MARKUP_PREFIX

    val_key = os.path.join(prefix, "svg_stack", "svg_xml_val")
    test_key = os.path.join(prefix, "svg_stack", "svg_xml_test")
    assert set(registry) == {val_key, test_key}

    val_dataset = registry[val_key]
    expected_tags = (
        "raw_web_markup",
        "issue:5056",
        "source:svg_stack",
        "surface:svg_xml",
        "split:val",
    )
    assert val_dataset.hf_dataset_id == "starvector/svg-stack"
    assert val_dataset.text_key == "Svg"
    assert val_dataset.split == "val"
    assert val_dataset.tags == expected_tags
|
|
||
|
|
||
def test_svg_stack_slice_materializes_as_hf_dataset_component() -> None:
    """The svg-stack test slice converts to a component backed by a Hugging Face source config."""
    slice_key = os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_stack", "svg_xml_test")
    svg_test_slice = raw_web_markup.raw_web_markup_raw_validation_sets()[slice_key]

    component = _to_dataset_component(svg_test_slice)

    assert isinstance(component.source, HfDatasetSourceConfig)
    assert component.source.id == "starvector/svg-stack"
    assert component.source.splits == ["test"]
    assert component.format.text_key == "Svg"
    expected_tags = ["raw_web_markup", "issue:5056", "source:svg_stack", "surface:svg_xml", "split:test"]
    assert component.tags == expected_tags
|
Comment on lines
+65
to
+75
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test only asserts Would be worth adding a test that invokes |
||
|
|
||
|
|
||
def test_file_backed_raw_web_markup_dataset_rejects_non_validation_split() -> None:
    """A file-backed path combined with ``split="test"`` must raise ``ValueError``."""
    file_backed_path = "gs://example-bucket/raw_web_markup.jsonl.gz"
    with pytest.raises(ValueError, match="Hugging Face dataset sources"):
        raw_text_dataset(file_backed_path, split="test")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`os.path.join` here produces `svg_stack/svg_xml_val` on Linux, which matches the existing paloma/uncheatable conventions, so fine — but note these are logical dictionary keys, not filesystem paths. A literal `"svg_stack/svg_xml_val"` (or a small helper like `_slice_key(source, name)`) would read more clearly and wouldn't suggest filesystem semantics. Optional stylistic nit; feel free to ignore.