Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions experiments/evals/long_tail_ppl_runnable.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,49 @@ def to_raw_text_dataset(self) -> RawTextEvaluationDataset:
split="test",
notes="Keep VerilogEval reference implementations and formatting intact.",
),
# Game / music (issue #5062)
#
# Lichess games are mirrored on HF with one full PGN per row (header tags,
# movetext, NAGs, comments, variations, result marker). We pin a single early
# month as the config so the slice is deterministic and small; the only split
# this corpus ships is ``train``, which we treat as a diagnostic eval split
# (we never train on it). License is CC0 upstream.
RunnableLongTailPplSlice(
name="lichess_pgn_2013_06",
family=LongTailPplFamily.GAME_MUSIC,
source_url="https://huggingface.co/datasets/Icannos/lichess_games",
hf_dataset=HfDatasetSpec(id="Icannos/lichess_games", name="2013-06"),
text_key="text",
split="train",
notes="Preserve PGN header tags, movetext, NAGs, comments, and result markers verbatim.",
),
# IrishMAN ships a dedicated ``validation`` split of ABC tunes; the column
# name is literally ``abc notation`` (with a space). License is MIT / public
# domain. We keep headers (``X:``/``T:``/``M:``/``L:``/``K:``/``Q:``), bar
# lines, inline ``%`` comments, repeats, chord symbols, and decorations
# verbatim so the gap-report byte buckets can attribute tokens to each.
RunnableLongTailPplSlice(
name="irishman_abc",
family=LongTailPplFamily.GAME_MUSIC,
source_url="https://huggingface.co/datasets/sander-wood/irishman",
hf_dataset=HfDatasetSpec(id="sander-wood/irishman"),
text_key="abc notation",
split="validation",
notes="Preserve ABC headers, bar lines, repeats, chord symbols, and decorations verbatim.",
),
# MelodyHub exposes the same ABC surface form but with a task-prefixed
# ``input`` column, giving a different structural flavour (task tag + ABC
# body) than IrishMAN. Useful as a second ABC slice to separate structural
# PPL from plain-ABC PPL.
RunnableLongTailPplSlice(
name="melodyhub_abc_input",
family=LongTailPplFamily.GAME_MUSIC,
source_url="https://huggingface.co/datasets/sander-wood/melodyhub",
hf_dataset=HfDatasetSpec(id="sander-wood/melodyhub"),
text_key="input",
split="validation",
notes="ABC with task-tag prefixes; keep task markers and musical body together.",
),
)

RUNNABLE_LONG_TAIL_PPL_REGISTRY: dict[str, RunnableLongTailPplSlice] = {
Expand Down
144 changes: 144 additions & 0 deletions experiments/exp_model_perplexity_gap_symbolic_notation_pilot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""Pilot perplexity-gap report for the game / music symbolic-notation slices.

Scoped to issue #5062 under epic #5005. Answers the DoD question:

Do gaps concentrate in metadata headers, symbolic sequences, comments,
or numeric annotations?

The answer comes from the gap-report's per-slice byte-bucket rollup
(whitespace / punctuation / numbers / words), not from any post-hoc
analysis here. This file only wires up the models and the slice subset.

Unlike ``exp_model_perplexity_gap_long_tail_runnable`` which sweeps every
runnable slice, this pilot intentionally narrows to the
``GAME_MUSIC`` family so the report surfaces symbolic-notation behaviour
without being dominated by larger SVG / Verilog slices.

PGN and ABC docs are typically an order of magnitude shorter than an
average Paloma document, so we raise ``max_docs_per_dataset`` well above
the long-tail-runnable default of 256 to keep the compressed-byte budget
comparable to a Paloma slice (per dlwh, #5062).
"""

from fray.v2.types import ResourceConfig

from experiments.evals.long_tail_ppl import LongTailPplFamily
from experiments.evals.long_tail_ppl_runnable import runnable_long_tail_ppl_slices
from marin.evaluation.perplexity_gap import (
GapFinderModelConfig,
RawTextEvaluationDataset,
default_model_perplexity_gap,
)
from marin.execution.executor import executor_main

# Single-host v5p slice in one region; all three comparisons below share it.
RESOURCE_CONFIG: ResourceConfig = ResourceConfig.with_tpu("v5p-8", regions=["us-central1"])

# PGN / ABC docs are much shorter than an average Paloma document, so a higher
# doc cap keeps the compressed-byte volume per slice roughly Paloma-sized while
# still being deterministic (HF datasets return rows in a fixed order).
MAX_DOCS_PER_DATASET: int = 2048
# Per-document byte cap, passed to every ``default_model_perplexity_gap`` call below.
MAX_DOC_BYTES: int = 32_768


def _game_music_datasets() -> dict[str, RawTextEvaluationDataset]:
    """Build the eval-dataset dict for every runnable GAME_MUSIC slice.

    Keys are the slices' registry keys; values are their raw-text dataset specs.
    """
    datasets: dict[str, RawTextEvaluationDataset] = {}
    for entry in runnable_long_tail_ppl_slices(family=LongTailPplFamily.GAME_MUSIC):
        datasets[entry.registry_key] = entry.to_raw_text_dataset()
    return datasets


# Materialized once at import time so all three comparisons score the exact
# same dataset dict.
DATASETS = _game_music_datasets()

# Model "A" in every comparison below. Scored with the Llama-3.1 tokenizer,
# per the ``tokenizer=`` field here.
MARIN_MODEL = GapFinderModelConfig(
    checkpoint_path="marin-community/marin-8b-base",
    checkpoint_is_hf=True,
    tokenizer="meta-llama/Llama-3.1-8B",
)

# W&B tags shared by all three comparisons so the runs group together in the
# UI. Both sampling caps are recorded (previously only the doc-count cap was),
# so a report can be traced back to the exact doc budget that produced it.
_COMMON_TAGS = [
    "eval=perplexity-gap",
    "rerun=symbolic-notation-pilot",
    "issue=5062",
    "epic=5005",
    "dataset_bundle=runnable_long_tail_hf_backed",
    "family=game_music",
    "source_split=hf_dataset",
    "region=us-central1",
    f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
    # Consistency fix: the byte cap is just as non-default as the doc cap and
    # equally shapes the sample, so tag it too.
    f"max_doc_bytes={MAX_DOC_BYTES}",
]

# Comparison 1: Marin 8B base vs Llama-3.1-8B base. Both sides are configured
# with the identical tokenizer string, so per-token losses line up one-to-one.
MARIN_VS_LLAMA = default_model_perplexity_gap(
    name="symbolic-notation-pilot-marin-8b-base-vs-llama-3.1-8b-base",
    model_a=MARIN_MODEL,
    model_b=GapFinderModelConfig(
        checkpoint_path="meta-llama/Llama-3.1-8B",
        checkpoint_is_hf=True,
        tokenizer="meta-llama/Llama-3.1-8B",
    ),
    datasets=DATASETS,
    resource_config=RESOURCE_CONFIG,
    per_device_batch_size=4,
    max_eval_length=4096,
    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
    max_doc_bytes=MAX_DOC_BYTES,
    wandb_tags=[
        *_COMMON_TAGS,
        "model_a=marin-community/marin-8b-base",
        "model_b=meta-llama/Llama-3.1-8B",
    ],
)

# Comparison 2: Marin 8B base vs Qwen3-8B base. The two sides use different
# tokenizers (Qwen3 vs Llama-3.1), so compare at the byte-bucket level the
# gap report emits (see module docstring) rather than raw per-token numbers.
MARIN_VS_QWEN3 = default_model_perplexity_gap(
    name="symbolic-notation-pilot-marin-8b-base-vs-qwen3-8b-base",
    model_a=MARIN_MODEL,
    model_b=GapFinderModelConfig(
        checkpoint_path="Qwen/Qwen3-8B-Base",
        checkpoint_is_hf=True,
        # NOTE: tokenizer repo is ``Qwen/Qwen3-8B`` while the checkpoint is the
        # ``-Base`` variant — presumably they share a tokenizer; confirm upstream.
        tokenizer="Qwen/Qwen3-8B",
    ),
    datasets=DATASETS,
    resource_config=RESOURCE_CONFIG,
    per_device_batch_size=4,
    max_eval_length=4096,
    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
    max_doc_bytes=MAX_DOC_BYTES,
    wandb_tags=[
        *_COMMON_TAGS,
        "model_a=marin-community/marin-8b-base",
        "model_b=Qwen/Qwen3-8B-Base",
    ],
)

# Gemma uses a distinct tokenizer (SentencePiece, 256k vocab) with very different
# whitespace handling from Llama-3 / Qwen3. Useful for seeing whether apparent
# gaps on whitespace-sensitive slices (kern, ABC) track with tokenizer choice.
# NOTE(review): no kern slice appears in this pilot's GAME_MUSIC additions —
# confirm whether the family registers one elsewhere, else drop the mention.
MARIN_VS_GEMMA2 = default_model_perplexity_gap(
    name="symbolic-notation-pilot-marin-8b-base-vs-gemma-2-9b",
    model_a=MARIN_MODEL,
    model_b=GapFinderModelConfig(
        checkpoint_path="google/gemma-2-9b",
        checkpoint_is_hf=True,
        tokenizer="google/gemma-2-9b",
    ),
    datasets=DATASETS,
    resource_config=RESOURCE_CONFIG,
    per_device_batch_size=4,
    max_eval_length=4096,
    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
    max_doc_bytes=MAX_DOC_BYTES,
    wandb_tags=[
        *_COMMON_TAGS,
        "model_a=marin-community/marin-8b-base",
        "model_b=google/gemma-2-9b",
    ],
)


if __name__ == "__main__":
    # Launch all three pairwise gap reports under the executor in one invocation.
    executor_main(
        [MARIN_VS_LLAMA, MARIN_VS_QWEN3, MARIN_VS_GEMMA2],
        description="Game / music symbolic-notation pilot perplexity-gap report (issue #5062).",
    )
46 changes: 46 additions & 0 deletions tests/evals/test_long_tail_ppl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
long_tail_raw_validation_sets,
render_long_tail_ppl_registry_markdown,
)
from experiments.evals.long_tail_ppl_runnable import (
RUNNABLE_LONG_TAIL_PPL_REGISTRY,
runnable_long_tail_ppl_slices,
runnable_long_tail_raw_validation_sets,
)
from levanter.data.text import HfDatasetSourceConfig
from marin.evaluation.perplexity_gap import _to_dataset_component, raw_text_dataset
from marin.processing.tokenize import HfDatasetSpec
Expand Down Expand Up @@ -50,3 +55,44 @@ def test_hf_backed_raw_dataset_preserves_requested_split():
def test_file_backed_raw_dataset_rejects_non_validation_split():
    # A file-backed (``gs://``) source with a non-``validation`` split must raise.
    # The match string suggests only Hugging Face dataset sources may select
    # other splits — NOTE(review): inferred from the message; confirm in
    # ``raw_text_dataset``.
    with pytest.raises(ValueError, match="Hugging Face dataset sources"):
        raw_text_dataset("gs://example-bucket/eval.jsonl", split="test")


def test_runnable_game_music_slices_are_registered():
    """The three game/music slices are registered with their exact HF specs."""
    family_slices = runnable_long_tail_ppl_slices(family=LongTailPplFamily.GAME_MUSIC)

    registered = {entry.name for entry in family_slices}
    assert registered >= {"lichess_pgn_2013_06", "irishman_abc", "melodyhub_abc_input"}

    pgn_slice = RUNNABLE_LONG_TAIL_PPL_REGISTRY["long_tail_ppl_runnable/game_music/lichess_pgn_2013_06"]
    # The corpus only ships ``train``; it is still used purely as a diagnostic eval.
    assert pgn_slice.split == "train"
    assert "split:train" in pgn_slice.tags
    assert pgn_slice.text_key == "text"
    assert pgn_slice.hf_dataset == HfDatasetSpec(id="Icannos/lichess_games", name="2013-06")

    abc_slice = RUNNABLE_LONG_TAIL_PPL_REGISTRY["long_tail_ppl_runnable/game_music/irishman_abc"]
    assert abc_slice.split == "validation"
    # Exact string on purpose: the upstream column name contains a space, and
    # asserting it verbatim catches drift if someone "normalizes" it.
    assert abc_slice.text_key == "abc notation"


def test_runnable_game_music_datasets_round_trip_through_dataset_component():
    """HF-backed game/music slices survive conversion into dataset components."""
    raw_sets = runnable_long_tail_raw_validation_sets()

    def component_for(registry_key):
        # One-line helper: look up the raw set and convert it in a single step.
        return _to_dataset_component(raw_sets[registry_key])

    pgn = component_for("long_tail_ppl_runnable/game_music/lichess_pgn_2013_06")
    assert isinstance(pgn.source, HfDatasetSourceConfig)
    assert (pgn.source.id, pgn.source.name) == ("Icannos/lichess_games", "2013-06")
    assert pgn.source.splits == ["train"]
    assert pgn.format.text_key == "text"

    abc = component_for("long_tail_ppl_runnable/game_music/irishman_abc")
    assert isinstance(abc.source, HfDatasetSourceConfig)
    assert abc.source.id == "sander-wood/irishman"
    assert abc.source.splits == ["validation"]
    assert abc.format.text_key == "abc notation"
Loading