Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions experiments/exp5056_raw_web_markup_ppl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""
Registry helpers for raw web, markup, and image-text perplexity-gap slices.

This module stays opt-in: call ``raw_web_markup_raw_validation_sets()`` explicitly
from a pilot gap experiment instead of extending ``default_raw_validation_sets()``.
The first non-empty slices use ``starvector/svg-stack`` directly from Hugging Face
so we can preserve exact SVG XML without adding a downloader.
"""

import os
from collections.abc import Mapping

from marin.evaluation.perplexity_gap import RawTextEvaluationDataset, raw_text_dataset
from marin.processing.tokenize import HfDatasetSpec

RAW_WEB_MARKUP_PREFIX = "raw_web_markup"
RAW_WEB_MARKUP_ISSUE_TAG = "issue:5056"
SVG_STACK_DATASET = HfDatasetSpec(id="starvector/svg-stack")
SVG_STACK_SOURCE_TAG = "source:svg_stack"
SVG_XML_SURFACE_TAG = "surface:svg_xml"
SVG_TEXT_KEY = "Svg"


def _hf_raw_web_markup_dataset(
hf_dataset: HfDatasetSpec,
*,
text_key: str,
split: str,
source_tag: str,
surface_tag: str,
) -> RawTextEvaluationDataset:
return raw_text_dataset(
hf_dataset,
text_key=text_key,
split=split,
tags=(RAW_WEB_MARKUP_PREFIX, RAW_WEB_MARKUP_ISSUE_TAG, source_tag, surface_tag, f"split:{split}"),
)


ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, RawTextEvaluationDataset] = {
os.path.join("svg_stack", "svg_xml_val"): _hf_raw_web_markup_dataset(
SVG_STACK_DATASET,
text_key=SVG_TEXT_KEY,
split="val",
source_tag=SVG_STACK_SOURCE_TAG,
surface_tag=SVG_XML_SURFACE_TAG,
),
os.path.join("svg_stack", "svg_xml_test"): _hf_raw_web_markup_dataset(
SVG_STACK_DATASET,
text_key=SVG_TEXT_KEY,
split="test",
source_tag=SVG_STACK_SOURCE_TAG,
surface_tag=SVG_XML_SURFACE_TAG,
),
}
Comment on lines +43 to +58
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

os.path.join here produces svg_stack/svg_xml_val on Linux, which matches the existing paloma/uncheatable conventions, so fine — but note these are logical dictionary keys, not filesystem paths. A literal "svg_stack/svg_xml_val" (or a small helper like _slice_key(source, name)) would read more clearly and wouldn't suggest filesystem semantics. Optional stylistic nit; feel free to ignore.



def prefixed_raw_web_markup_validation_sets(
datasets: Mapping[str, RawTextEvaluationDataset],
) -> dict[str, RawTextEvaluationDataset]:
"""Prefix raw-web-markup slice names with ``raw_web_markup/``."""
return {os.path.join(RAW_WEB_MARKUP_PREFIX, slice_name): dataset for slice_name, dataset in datasets.items()}


def raw_web_markup_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
"""Raw web/markup evaluation slices keyed by ``raw_web_markup/<slice>``."""
return prefixed_raw_web_markup_validation_sets(ACTIVE_RAW_WEB_MARKUP_DATASETS)
11 changes: 10 additions & 1 deletion lib/marin/src/marin/evaluation/perplexity_gap.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class RawTextEvaluationDataset:
hf_dataset_id: str | None = None
hf_dataset_name: str | None = None
text_key: str = "text"
split: str = "validation"
tags: tuple[str, ...] = ()


Expand All @@ -63,16 +64,20 @@ def raw_text_dataset(
source: str | InputName | ExecutorStep | HfDatasetSpec,
*,
text_key: str = "text",
split: str = "validation",
tags: tuple[str, ...] = (),
) -> RawTextEvaluationDataset:
if isinstance(source, HfDatasetSpec):
return RawTextEvaluationDataset(
hf_dataset_id=source.id,
hf_dataset_name=source.name,
text_key=text_key,
split=split,
tags=tags,
)
return RawTextEvaluationDataset(input_path=source, text_key=text_key, tags=tags)
if split != "validation":
raise ValueError("split is only supported for Hugging Face dataset sources; file paths use validation.")
return RawTextEvaluationDataset(input_path=source, text_key=text_key, split=split, tags=tags)


def default_model_perplexity_gap(
Expand Down Expand Up @@ -184,10 +189,13 @@ def _to_dataset_component(config: RawTextEvaluationDataset) -> DatasetComponent:
id=config.hf_dataset_id,
name=config.hf_dataset_name,
format=dataset_format,
splits=[config.split],
)
else:
if config.input_path is None:
raise ValueError("RawTextEvaluationDataset requires either input_path or hf_dataset_id.")
if config.split != "validation":
raise ValueError("RawTextEvaluationDataset split is only supported for Hugging Face dataset sources.")
input_path = config.input_path
if isinstance(input_path, ExecutorStep):
input_path = input_path.as_input_name()
Expand Down Expand Up @@ -234,5 +242,6 @@ def _cache_key_for_dataset(dataset: RawTextEvaluationDataset) -> dict[str, Any]:
"hf_dataset_id": dataset.hf_dataset_id,
"hf_dataset_name": dataset.hf_dataset_name,
"text_key": dataset.text_key,
"split": dataset.split,
"tags": dataset.tags,
}
80 changes: 80 additions & 0 deletions tests/evals/test_exp5056_raw_web_markup_ppl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

import os

import pytest

from experiments import exp5056_raw_web_markup_ppl as raw_web_markup
from levanter.data.text import HfDatasetSourceConfig
from marin.evaluation.perplexity_gap import (
RawTextEvaluationDataset,
_to_dataset_component,
raw_text_dataset,
)


def test_prefixed_raw_web_markup_validation_sets_prefixes_each_slice() -> None:
warc = RawTextEvaluationDataset(input_path="raw/common_crawl/warc.jsonl.gz")
wat = RawTextEvaluationDataset(input_path="raw/common_crawl/wat.jsonl.gz")

prefixed = raw_web_markup.prefixed_raw_web_markup_validation_sets(
{
"cc_warc_html": warc,
"cc_wat_json": wat,
}
)

assert prefixed == {
os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "cc_warc_html"): warc,
os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "cc_wat_json"): wat,
}


def test_raw_web_markup_raw_validation_sets_reads_active_registry(monkeypatch) -> None:
svg = RawTextEvaluationDataset(input_path="raw/svg_stack/svg.xml.jsonl.gz")

monkeypatch.setattr(raw_web_markup, "ACTIVE_RAW_WEB_MARKUP_DATASETS", {"svg_xml": svg})

assert raw_web_markup.raw_web_markup_raw_validation_sets() == {
os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_xml"): svg
}


def test_raw_web_markup_raw_validation_sets_registers_svg_stack_hf_slices() -> None:
datasets = raw_web_markup.raw_web_markup_raw_validation_sets()

val_key = os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_stack", "svg_xml_val")
test_key = os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_stack", "svg_xml_test")

assert set(datasets) == {val_key, test_key}

val_dataset = datasets[val_key]
assert val_dataset.hf_dataset_id == "starvector/svg-stack"
assert val_dataset.text_key == "Svg"
assert val_dataset.split == "val"
assert val_dataset.tags == (
"raw_web_markup",
"issue:5056",
"source:svg_stack",
"surface:svg_xml",
"split:val",
)


def test_svg_stack_slice_materializes_as_hf_dataset_component() -> None:
datasets = raw_web_markup.raw_web_markup_raw_validation_sets()
component = _to_dataset_component(
datasets[os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_stack", "svg_xml_test")]
)

assert isinstance(component.source, HfDatasetSourceConfig)
assert component.source.id == "starvector/svg-stack"
assert component.source.splits == ["test"]
assert component.format.text_key == "Svg"
assert component.tags == ["raw_web_markup", "issue:5056", "source:svg_stack", "surface:svg_xml", "split:test"]
Comment on lines +65 to +75
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test only asserts component.source.splits == ["test"], which is the value you set, not the value the gap finder actually uses. At runtime levanter/analysis/perplexity_gap.py:434 calls component.source.get_shard_source("validation") — the literal string — so with splits=["test"] (or ["val"]) this returns None and the slice is silently skipped.

Would be worth adding a test that invokes iter_raw_text_documents (or at least component.source.get_shard_source("validation")) against an HfDatasetSourceConfig built from a non-validation split, so the next regression of this kind fails loudly.



def test_file_backed_raw_web_markup_dataset_rejects_non_validation_split() -> None:
with pytest.raises(ValueError, match="Hugging Face dataset sources"):
raw_text_dataset("gs://example-bucket/raw_web_markup.jsonl.gz", split="test")
Loading