Skip to content

Commit dad6f61

Browse files
committed
Add long-tail PPL gap rerun registry
1 parent 4f0728d commit dad6f61

8 files changed

Lines changed: 872 additions & 3 deletions

File tree

experiments/evals/long_tail_ppl.py

Lines changed: 497 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Runnable first-pass long-tail PPL slices backed by public Hugging Face datasets."""
5+
6+
from __future__ import annotations
7+
8+
import posixpath
9+
from dataclasses import dataclass
10+
11+
from marin.evaluation.perplexity_gap import RawTextEvaluationDataset, raw_text_dataset
12+
from marin.processing.tokenize import HfDatasetSpec
13+
14+
from experiments.evals.long_tail_ppl import LongTailPplFamily
15+
16+
# Provenance note surfaced in the rendered registry markdown (see
# render_runnable_long_tail_registry_markdown below).
RUNNABLE_LONG_TAIL_SOURCE_NOTE = (
    "These slices are directly executable from public Hugging Face datasets and do not require a bulk mirror."
)
19+
20+
21+
@dataclass(frozen=True)
class RunnableLongTailPplSlice:
    """A runnable long-tail slice backed by a small public Hugging Face dataset."""

    name: str  # slice identifier, unique within its family
    family: LongTailPplFamily  # long-tail family this slice belongs to
    source_url: str  # human-readable provenance link for the dataset
    hf_dataset: HfDatasetSpec  # Hugging Face dataset the slice streams from
    text_key: str  # dataset column holding the raw document text
    split: str  # dataset split to evaluate
    notes: str = ""  # free-form maintainer guidance

    @property
    def registry_key(self) -> str:
        """Stable registry path: long_tail_ppl_runnable/<family>/<name>."""
        return posixpath.join("long_tail_ppl_runnable", self.family.value, self.name)

    @property
    def tags(self) -> tuple[str, ...]:
        """Tags propagated onto the materialized evaluation dataset."""
        split_tag = f"split:{self.split}"
        return ("long_tail_ppl", "long_tail_ppl_runnable", self.family.value, split_tag)

    def to_raw_text_dataset(self) -> RawTextEvaluationDataset:
        """Materialize this slice as a raw-text evaluation dataset."""
        return raw_text_dataset(
            self.hf_dataset,
            text_key=self.text_key,
            split=self.split,
            tags=self.tags,
        )
43+
44+
45+
# Registry of runnable slices. Each entry pins a public HF dataset, the text
# column to read, and the split to evaluate. Keep text preprocessing minimal so
# markup/code structure survives into the perplexity evaluation.
RUNNABLE_LONG_TAIL_PPL_SLICES: tuple[RunnableLongTailPplSlice, ...] = (
    # SVG markup slices (web markup / image-adjacent text family).
    RunnableLongTailPplSlice(
        name="svg_stack_val",
        family=LongTailPplFamily.WEB_MARKUP_IMAGE_TEXT,
        source_url="https://huggingface.co/datasets/starvector/svg-stack",
        hf_dataset=HfDatasetSpec(id="starvector/svg-stack"),
        # NOTE: the column is capitalized "Svg" upstream — do not lowercase.
        text_key="Svg",
        split="val",
        notes="Preserve SVG XML and caption-adjacent markup in the validation split.",
    ),
    RunnableLongTailPplSlice(
        name="svg_stack_test",
        family=LongTailPplFamily.WEB_MARKUP_IMAGE_TEXT,
        source_url="https://huggingface.co/datasets/starvector/svg-stack",
        hf_dataset=HfDatasetSpec(id="starvector/svg-stack"),
        text_key="Svg",
        split="test",
        notes="Preserve SVG XML in the held-out test split.",
    ),
    # VerilogEval slices (formal hardware-description family); the same split is
    # read twice with different text columns (problem text vs. reference code).
    RunnableLongTailPplSlice(
        name="verilogeval_prompt",
        family=LongTailPplFamily.FORMAL_HARDWARE,
        source_url="https://huggingface.co/datasets/dakies/nvlabs-verilogeval",
        hf_dataset=HfDatasetSpec(id="dakies/nvlabs-verilogeval"),
        text_key="prompt",
        split="test",
        notes="Keep VerilogEval problem statements and interface text intact.",
    ),
    RunnableLongTailPplSlice(
        name="verilogeval_canonical_solution",
        family=LongTailPplFamily.FORMAL_HARDWARE,
        source_url="https://huggingface.co/datasets/dakies/nvlabs-verilogeval",
        hf_dataset=HfDatasetSpec(id="dakies/nvlabs-verilogeval"),
        text_key="canonical_solution",
        split="test",
        notes="Keep VerilogEval reference implementations and formatting intact.",
    ),
)
83+
84+
# Lookup table keyed by registry path ("long_tail_ppl_runnable/<family>/<name>").
RUNNABLE_LONG_TAIL_PPL_REGISTRY: dict[str, RunnableLongTailPplSlice] = {
    slice_.registry_key: slice_ for slice_ in RUNNABLE_LONG_TAIL_PPL_SLICES
}
87+
88+
89+
def runnable_long_tail_ppl_slices(*, family: LongTailPplFamily | None = None) -> tuple[RunnableLongTailPplSlice, ...]:
    """Return the runnable slices, optionally filtered to a single family."""
    if family is not None:
        return tuple(entry for entry in RUNNABLE_LONG_TAIL_PPL_SLICES if entry.family == family)
    return RUNNABLE_LONG_TAIL_PPL_SLICES
93+
94+
95+
def runnable_long_tail_raw_validation_sets() -> dict[str, RawTextEvaluationDataset]:
    """Materialize the runnable HF-backed slices into raw-text datasets."""

    datasets: dict[str, RawTextEvaluationDataset] = {}
    for entry in RUNNABLE_LONG_TAIL_PPL_SLICES:
        datasets[entry.registry_key] = entry.to_raw_text_dataset()
    return datasets
99+
100+
101+
def render_runnable_long_tail_registry_markdown() -> str:
    """Render the runnable slice registry as markdown, grouped by family.

    Families with no runnable slices are omitted; output always ends with a
    single trailing newline.
    """
    out = ["# Runnable long-tail PPL registry", "", RUNNABLE_LONG_TAIL_SOURCE_NOTE, ""]
    for fam in LongTailPplFamily:
        members = runnable_long_tail_ppl_slices(family=fam)
        if not members:
            continue
        out.append(f"## {fam.value}")
        for entry in members:
            out.append(f"- `{entry.registry_key}`: split={entry.split} | {entry.text_key} | {entry.source_url}")
            if entry.notes:
                out.append(f" - {entry.notes}")
        out.append("")
    rendered = "\n".join(out)
    return rendered.rstrip() + "\n"
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from fray.v2.types import ResourceConfig
5+
6+
from experiments.evals.long_tail_ppl_runnable import runnable_long_tail_raw_validation_sets
7+
from marin.evaluation.perplexity_gap import GapFinderModelConfig, default_model_perplexity_gap
8+
from marin.execution.executor import executor_main
9+
10+
# Single v5p-8 TPU slice in us-central1, shared by both gap runs below.
RESOURCE_CONFIG = ResourceConfig.with_tpu("v5p-8", regions=["us-central1"])
# Cap per-dataset document count and per-document size to keep the rerun cheap.
MAX_DOCS_PER_DATASET = 256
MAX_DOC_BYTES = 32_768

# Runnable HF-backed long-tail slices, keyed by registry path.
DATASETS = runnable_long_tail_raw_validation_sets()
15+
16+
# Marin 8B base checkpoint from the HF hub; configured with the Llama 3.1
# tokenizer (model_a in both comparisons below).
MARIN_MODEL = GapFinderModelConfig(
    checkpoint_path="marin-community/marin-8b-base",
    checkpoint_is_hf=True,
    tokenizer="meta-llama/Llama-3.1-8B",
)
21+
22+
# Perplexity-gap report: Marin 8B base (model_a) vs Llama 3.1 8B base (model_b)
# over the runnable long-tail slices.
MARIN_VS_LLAMA = default_model_perplexity_gap(
    name="long-tail-runnable-marin-8b-base-vs-llama-3.1-8b-base-doccap256",
    model_a=MARIN_MODEL,
    model_b=GapFinderModelConfig(
        checkpoint_path="meta-llama/Llama-3.1-8B",
        checkpoint_is_hf=True,
        tokenizer="meta-llama/Llama-3.1-8B",
    ),
    datasets=DATASETS,
    resource_config=RESOURCE_CONFIG,
    per_device_batch_size=4,
    max_eval_length=4096,  # presumably the per-example sequence cap — confirm in default_model_perplexity_gap
    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
    max_doc_bytes=MAX_DOC_BYTES,
    # W&B tags encode the run configuration so reports can be filtered later;
    # keep them in sync with the config values above.
    wandb_tags=[
        "eval=perplexity-gap",
        "rerun=long-tail-runnable-first-pass",
        "model_a=marin-community/marin-8b-base",
        "model_b=meta-llama/Llama-3.1-8B",
        "dataset_bundle=runnable_long_tail_hf_backed",
        "source_split=hf_dataset",
        "region=us-central1",
        f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
    ],
)
47+
48+
# Perplexity-gap report: Marin 8B base (model_a) vs Qwen3 8B base (model_b)
# over the same runnable long-tail slices.
MARIN_VS_QWEN3 = default_model_perplexity_gap(
    name="long-tail-runnable-marin-8b-base-vs-qwen3-8b-base-doccap256",
    model_a=MARIN_MODEL,
    model_b=GapFinderModelConfig(
        checkpoint_path="Qwen/Qwen3-8B-Base",
        checkpoint_is_hf=True,
        # NOTE(review): tokenizer repo ("Qwen/Qwen3-8B") differs from the
        # checkpoint repo ("Qwen/Qwen3-8B-Base") — confirm both ship the same
        # tokenizer, otherwise perplexities are not comparable.
        tokenizer="Qwen/Qwen3-8B",
    ),
    datasets=DATASETS,
    resource_config=RESOURCE_CONFIG,
    per_device_batch_size=4,
    max_eval_length=4096,
    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
    max_doc_bytes=MAX_DOC_BYTES,
    # W&B tags mirror the run configuration for later filtering.
    wandb_tags=[
        "eval=perplexity-gap",
        "rerun=long-tail-runnable-first-pass",
        "model_a=marin-community/marin-8b-base",
        "model_b=Qwen/Qwen3-8B-Base",
        "dataset_bundle=runnable_long_tail_hf_backed",
        "source_split=hf_dataset",
        "region=us-central1",
        f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
    ],
)
73+
74+
75+
if __name__ == "__main__":
    # Submit both gap reports through the Marin executor.
    executor_main(
        [MARIN_VS_LLAMA, MARIN_VS_QWEN3],
        description="Run Marin perplexity-gap reports on runnable first-pass long-tail PPL slices.",
    )
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from fray.v2.types import ResourceConfig
5+
6+
from experiments.evals.long_tail_ppl_runnable import runnable_long_tail_raw_validation_sets
7+
from marin.evaluation.perplexity_gap import GapFinderModelConfig, default_model_perplexity_gap
8+
from marin.execution.executor import executor_main
9+
10+
# Single v5p-8 TPU slice in us-central1 for the smoke run.
RESOURCE_CONFIG = ResourceConfig.with_tpu("v5p-8", regions=["us-central1"])
# Smoke-sized caps: far fewer docs than the full rerun (32 vs 256).
MAX_DOCS_PER_DATASET = 32
MAX_DOC_BYTES = 32_768

# Runnable HF-backed long-tail slices, keyed by registry path.
DATASETS = runnable_long_tail_raw_validation_sets()
15+
16+
# Marin 8B base checkpoint from the HF hub; configured with the Llama 3.1
# tokenizer.
MARIN_MODEL = GapFinderModelConfig(
    checkpoint_path="marin-community/marin-8b-base",
    checkpoint_is_hf=True,
    tokenizer="meta-llama/Llama-3.1-8B",
)
21+
22+
# Smoke-test perplexity-gap report: Marin 8B base vs Llama 3.1 8B base with a
# small per-dataset document cap, to validate the pipeline end-to-end cheaply.
STEP = default_model_perplexity_gap(
    name="long-tail-smoke-marin-8b-base-vs-llama-3.1-8b-base-doccap32",
    model_a=MARIN_MODEL,
    model_b=GapFinderModelConfig(
        checkpoint_path="meta-llama/Llama-3.1-8B",
        checkpoint_is_hf=True,
        tokenizer="meta-llama/Llama-3.1-8B",
    ),
    datasets=DATASETS,
    resource_config=RESOURCE_CONFIG,
    per_device_batch_size=4,
    max_eval_length=4096,
    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
    max_doc_bytes=MAX_DOC_BYTES,
    # W&B tags encode the run configuration for later filtering.
    wandb_tags=[
        "eval=perplexity-gap",
        "smoke=long-tail-ppl",
        "source_split=hf_dataset",
        "region=us-central1",
        "dataset_bundle=runnable_long_tail_hf_backed",
        f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
    ],
)
45+
46+
47+
if __name__ == "__main__":
    # Submit the single smoke step through the Marin executor.
    executor_main([STEP], description="Smoke-run runnable long-tail PPL slices from public Hugging Face datasets.")

lib/levanter/src/levanter/analysis/perplexity_gap.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ def iter_raw_text_documents(
431431
"Gap finding currently supports TextLmDatasetFormat only."
432432
)
433433

434-
source = component.source.get_shard_source("validation")
434+
source = component.source.get_shard_source(component.split)
435435
if source is None:
436436
continue
437437

lib/levanter/src/levanter/data/text/datasets.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,7 @@ class DatasetComponent(DatasetComponentBase):
335335
format: LmDatasetFormatBase = field(default_factory=TextLmDatasetFormat)
336336
pack: bool | int | Literal["pad"] | None = None
337337
tags: list[str] | None = None
338+
split: str = "validation"
338339

339340

340341
@DatasetComponentBase.register_subclass("direct")

lib/marin/src/marin/evaluation/perplexity_gap.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class RawTextEvaluationDataset:
4040
hf_dataset_id: str | None = None
4141
hf_dataset_name: str | None = None
4242
text_key: str = "text"
43+
split: str = "validation"
4344
tags: tuple[str, ...] = ()
4445

4546

@@ -63,16 +64,20 @@ def raw_text_dataset(
6364
source: str | InputName | ExecutorStep | HfDatasetSpec,
6465
*,
6566
text_key: str = "text",
67+
split: str = "validation",
6668
tags: tuple[str, ...] = (),
6769
) -> RawTextEvaluationDataset:
6870
if isinstance(source, HfDatasetSpec):
6971
return RawTextEvaluationDataset(
7072
hf_dataset_id=source.id,
7173
hf_dataset_name=source.name,
7274
text_key=text_key,
75+
split=split,
7376
tags=tags,
7477
)
75-
return RawTextEvaluationDataset(input_path=source, text_key=text_key, tags=tags)
78+
if split != "validation":
79+
raise ValueError("split is only supported for Hugging Face dataset sources; file paths use validation.")
80+
return RawTextEvaluationDataset(input_path=source, text_key=text_key, split=split, tags=tags)
7681

7782

7883
def default_model_perplexity_gap(
@@ -184,10 +189,13 @@ def _to_dataset_component(config: RawTextEvaluationDataset) -> DatasetComponent:
184189
id=config.hf_dataset_id,
185190
name=config.hf_dataset_name,
186191
format=dataset_format,
192+
splits=[config.split],
187193
)
188194
else:
189195
if config.input_path is None:
190196
raise ValueError("RawTextEvaluationDataset requires either input_path or hf_dataset_id.")
197+
if config.split != "validation":
198+
raise ValueError("RawTextEvaluationDataset split is only supported for Hugging Face dataset sources.")
191199
input_path = config.input_path
192200
if isinstance(input_path, ExecutorStep):
193201
input_path = input_path.as_input_name()
@@ -196,7 +204,7 @@ def _to_dataset_component(config: RawTextEvaluationDataset) -> DatasetComponent:
196204
validation_urls=[input_path], # type: ignore[list-item]
197205
format=dataset_format,
198206
)
199-
return DatasetComponent(source=source, format=dataset_format, tags=list(config.tags))
207+
return DatasetComponent(source=source, format=dataset_format, tags=list(config.tags), split=config.split)
200208

201209

202210
def _default_step_name(model_a: GapFinderModelConfig, model_b: GapFinderModelConfig) -> str:
@@ -234,5 +242,6 @@ def _cache_key_for_dataset(dataset: RawTextEvaluationDataset) -> dict[str, Any]:
234242
"hf_dataset_id": dataset.hf_dataset_id,
235243
"hf_dataset_name": dataset.hf_dataset_name,
236244
"text_key": dataset.text_key,
245+
"split": dataset.split,
237246
"tags": dataset.tags,
238247
}

0 commit comments

Comments
 (0)