Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions experiments/exp5056_raw_web_markup_ppl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""
Registry helpers for raw web, markup, and image-text perplexity-gap slices.
"""
Comment on lines +4 to +6
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 Docstring references tracking issues, and those references will rot.

CLAUDE.md:

Don't reference the current task, fix, or callers ("used by X", "added for the Y flow", "handles the case from issue #123"), since those belong in the PR description and rot as the codebase evolves.

This docstring opens with #5056: … / Parent: #5005, then refers back to "#5056 design review", "tracked in #5056", and "follow-up PRs". Once those issues are closed or renumbered, the docstring becomes misleading.

A module docstring should describe what the module is in the current codebase (one or two sentences about surface-preserving PPL slices, the prefix convention, and where the dataset values come from). Targets and rationale belong in the issue, not here.


import os
from collections.abc import Mapping

from marin.evaluation.perplexity_gap import RawTextEvaluationDataset

# Prefix joined in front of every slice name by prefixed_raw_web_markup_validation_sets().
RAW_WEB_MARKUP_PREFIX = "raw_web_markup"

# Registry of slices keyed by un-prefixed slice name; starts empty and is read by
# raw_web_markup_raw_validation_sets() each time that function is called.
ACTIVE_RAW_WEB_MARKUP_DATASETS: dict[str, RawTextEvaluationDataset] = {}


def prefixed_raw_web_markup_validation_sets(
    datasets: Mapping[str, RawTextEvaluationDataset],
) -> dict[str, RawTextEvaluationDataset]:
    """Return a new dict with every slice name placed under ``RAW_WEB_MARKUP_PREFIX``.

    Keys are built with ``os.path.join``, so the separator follows the host
    platform (``raw_web_markup/<slice>`` on POSIX). The dataset values are
    passed through unchanged, and the input mapping is not modified.
    """
    return {
        os.path.join(RAW_WEB_MARKUP_PREFIX, name): entry
        for name, entry in datasets.items()
    }


def raw_web_markup_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
    """Return the active raw web/markup slices keyed by ``raw_web_markup/<slice>``.

    ``ACTIVE_RAW_WEB_MARKUP_DATASETS`` is looked up on each call, so the result
    reflects the registry's contents at call time.
    """
    active_slices = ACTIVE_RAW_WEB_MARKUP_DATASETS
    return prefixed_raw_web_markup_validation_sets(active_slices)
34 changes: 34 additions & 0 deletions tests/evals/test_exp5056_raw_web_markup_ppl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

import os

from experiments import exp5056_raw_web_markup_ppl as raw_web_markup
from marin.evaluation.perplexity_gap import RawTextEvaluationDataset


def test_prefixed_raw_web_markup_validation_sets_prefixes_each_slice() -> None:
    """Every slice name gains the shared prefix and keeps its dataset value."""
    prefix = raw_web_markup.RAW_WEB_MARKUP_PREFIX
    warc_slice = RawTextEvaluationDataset(input_path="raw/common_crawl/warc.jsonl.gz")
    wat_slice = RawTextEvaluationDataset(input_path="raw/common_crawl/wat.jsonl.gz")
    slices = {"cc_warc_html": warc_slice, "cc_wat_json": wat_slice}

    # Expected keys are spelled out per slice rather than derived from ``slices``,
    # so the assertion does not simply mirror the implementation.
    expected = {
        os.path.join(prefix, "cc_warc_html"): warc_slice,
        os.path.join(prefix, "cc_wat_json"): wat_slice,
    }

    assert raw_web_markup.prefixed_raw_web_markup_validation_sets(slices) == expected


def test_raw_web_markup_raw_validation_sets_reads_active_registry(monkeypatch) -> None:
    """The accessor reflects whatever the module-level registry holds when called."""
    svg_slice = RawTextEvaluationDataset(input_path="raw/svg_stack/svg.xml.jsonl.gz")
    # Swap in a one-entry registry; monkeypatch restores the original after the test.
    monkeypatch.setattr(raw_web_markup, "ACTIVE_RAW_WEB_MARKUP_DATASETS", {"svg_xml": svg_slice})

    observed = raw_web_markup.raw_web_markup_raw_validation_sets()
    expected_key = os.path.join(raw_web_markup.RAW_WEB_MARKUP_PREFIX, "svg_xml")

    assert observed == {expected_key: svg_slice}
Loading