diff --git a/experiments/exp5056_raw_web_markup_ppl.py b/experiments/exp5056_raw_web_markup_ppl.py
new file mode 100644
index 0000000000..006599ce86
--- /dev/null
+++ b/experiments/exp5056_raw_web_markup_ppl.py
@@ -0,0 +1,42 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Registry helpers for raw web, markup, and image-text perplexity-gap slices.
+"""
+
+import posixpath
+from collections.abc import Mapping
+
+from marin.evaluation.perplexity_gap import RawTextEvaluationDataset
+
+# Prefix prepended to every slice name by the helpers below.
+RAW_WEB_MARKUP_PREFIX = "raw_web_markup"
+
+# Datasets returned by raw_web_markup_raw_validation_sets(), keyed by unprefixed slice name.
+ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, RawTextEvaluationDataset] = {}
+
+
+def prefixed_raw_web_markup_validation_sets(
+    datasets: Mapping[str, RawTextEvaluationDataset],
+) -> dict[str, RawTextEvaluationDataset]:
+    """Prefix raw-web-markup slice names with ``raw_web_markup/``.
+
+    Args:
+        datasets: Evaluation datasets keyed by unprefixed slice name.
+
+    Returns:
+        A new dict mapping ``raw_web_markup/<slice_name>`` to the same dataset
+        objects; ``datasets`` itself is not modified.
+    """
+    # Keys are logical dataset names, not filesystem paths: join with posixpath so
+    # the separator is "/" on every platform (os.path.join uses a backslash on Windows).
+    return {posixpath.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset for slice_name, dataset in datasets.items()}
+
+
+def raw_web_markup_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
+    """Raw web/markup evaluation slices keyed by ``raw_web_markup/<slice_name>``.
+
+    Reads ``ACTIVE_RAW_WEB_MARKUP_DATASETS`` at call time and returns a new dict.
+    """
+    return prefixed_raw_web_markup_validation_sets(ACTIVE_RAW_WEB_MARKUP_DATASETS)
diff --git a/tests/evals/test_exp5056_raw_web_markup_ppl.py b/tests/evals/test_exp5056_raw_web_markup_ppl.py
new file mode 100644
index 0000000000..be842b01f8
--- /dev/null
+++ b/tests/evals/test_exp5056_raw_web_markup_ppl.py
@@ -0,0 +1,35 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+from experiments import exp5056_raw_web_markup_ppl as raw_web_markup
+from marin.evaluation.perplexity_gap import RawTextEvaluationDataset
+
+
+def test_prefixed_raw_web_markup_validation_sets_prefixes_each_slice() -> None:
+    warc = RawTextEvaluationDataset(input_path="raw/common_crawl/warc.jsonl.gz")
+    wat = RawTextEvaluationDataset(input_path="raw/common_crawl/wat.jsonl.gz")
+
+    prefixed = raw_web_markup.prefixed_raw_web_markup_validation_sets(
+        {
+            "cc_warc_html": warc,
+            "cc_wat_json": wat,
+        }
+    )
+
+    # Expected keys are spelled with a literal "/" so the test pins the key format
+    # rather than mirroring the implementation's join call.
+    assert prefixed == {
+        f"{raw_web_markup.RAW_WEB_MARKUP_PREFIX}/cc_warc_html": warc,
+        f"{raw_web_markup.RAW_WEB_MARKUP_PREFIX}/cc_wat_json": wat,
+    }
+
+
+def test_raw_web_markup_raw_validation_sets_reads_active_registry(monkeypatch) -> None:
+    svg = RawTextEvaluationDataset(input_path="raw/svg_stack/svg.xml.jsonl.gz")
+
+    # Rebind the module-level registry for this test only; monkeypatch restores it afterwards.
+    monkeypatch.setattr(raw_web_markup, "ACTIVE_RAW_WEB_MARKUP_DATASETS", {"svg_xml": svg})
+
+    assert raw_web_markup.raw_web_markup_raw_validation_sets() == {
+        f"{raw_web_markup.RAW_WEB_MARKUP_PREFIX}/svg_xml": svg
+    }