[evals] Add game / music runnable PPL slices (issue #5062)

github-actions[bot] · dlwh · github-actions[bot] · commit c8887516f758 · 2026-04-22T21:45:53.000Z
Adds HF-backed runnable slices for the GAME_MUSIC family on top of the long-tail PPL runnable registry introduced in #5075: - lichess_pgn_2013_06: Icannos/lichess_games @ config 2013-06, text column carries full PGN (headers, movetext, NAGs, comments, result); CC0. - irishman_abc: sander-wood/irishman validation split, column is the literal "abc notation" string; MIT / public domain. - melodyhub_abc_input: sander-wood/melodyhub validation split, ABC body with task-tag prefixes so the gap report can separate plain-ABC PPL from ABC-with-structural-prefix PPL. Also adds a focused pilot gap-report experiment (exp_model_perplexity_gap_symbolic_notation_pilot.py) running Marin-8B against Llama-3.1-8B, Qwen3-8B-Base, and Gemma-2-9b on just the game/music slices, with max_docs_per_dataset=2048 so each slice's compressed-byte volume stays roughly Paloma-sized (PGN/ABC docs are much shorter than an average Paloma doc). Humdrum/Kern has no clean HF mirror and remains as a stub in the long_tail_ppl.py registry for a later download-step mirror. Follow-up tokenizer-axis comparison filed as #5079. Co-authored-by: David Hall <dlwh@users.noreply.github.com>
diff --git a/experiments/evals/long_tail_ppl_runnable.py b/experiments/evals/long_tail_ppl_runnable.py
@@ -79,6 +79,49 @@ def to_raw_text_dataset(self) -> RawTextEvaluationDataset:
         split="test",
         notes="Keep VerilogEval reference implementations and formatting intact.",
     ),
+    # Game / music (issue #5062)
+    #
+    # Lichess games are mirrored on HF with one full PGN per row (header tags,
+    # movetext, NAGs, comments, variations, result marker). We pin a single early
+    # month as the config so the slice is deterministic and small; the only split
+    # this corpus ships is ``train``, which we treat as a diagnostic eval split
+    # (we never train on it). License is CC0 upstream.
+    RunnableLongTailPplSlice(
+        name="lichess_pgn_2013_06",
+        family=LongTailPplFamily.GAME_MUSIC,
+        source_url="https://huggingface.co/datasets/Icannos/lichess_games",
+        hf_dataset=HfDatasetSpec(id="Icannos/lichess_games", name="2013-06"),
+        text_key="text",
+        split="train",
+        notes="Preserve PGN header tags, movetext, NAGs, comments, and result markers verbatim.",
+    ),
+    # IrishMAN ships a dedicated ``validation`` split of ABC tunes; the column
+    # name is literally ``abc notation`` (with a space). License is MIT / public
+    # domain. We keep headers (``X:``/``T:``/``M:``/``L:``/``K:``/``Q:``), bar
+    # lines, inline ``%`` comments, repeats, chord symbols, and decorations
+    # verbatim so the gap-report byte buckets can attribute tokens to each.
+    RunnableLongTailPplSlice(
+        name="irishman_abc",
+        family=LongTailPplFamily.GAME_MUSIC,
+        source_url="https://huggingface.co/datasets/sander-wood/irishman",
+        hf_dataset=HfDatasetSpec(id="sander-wood/irishman"),
+        text_key="abc notation",
+        split="validation",
+        notes="Preserve ABC headers, bar lines, repeats, chord symbols, and decorations verbatim.",
+    ),
+    # MelodyHub exposes the same ABC surface form but with a task-prefixed
+    # ``input`` column, giving a different structural flavour (task tag + ABC
+    # body) than IrishMAN. Useful as a second ABC slice to separate structural
+    # PPL from plain-ABC PPL.
+    RunnableLongTailPplSlice(
+        name="melodyhub_abc_input",
+        family=LongTailPplFamily.GAME_MUSIC,
+        source_url="https://huggingface.co/datasets/sander-wood/melodyhub",
+        hf_dataset=HfDatasetSpec(id="sander-wood/melodyhub"),
+        text_key="input",
+        split="validation",
+        notes="ABC with task-tag prefixes; keep task markers and musical body together.",
+    ),
 )
 
 RUNNABLE_LONG_TAIL_PPL_REGISTRY: dict[str, RunnableLongTailPplSlice] = {
diff --git a/experiments/exp_model_perplexity_gap_symbolic_notation_pilot.py b/experiments/exp_model_perplexity_gap_symbolic_notation_pilot.py
@@ -0,0 +1,144 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Pilot perplexity-gap report for the game / music symbolic-notation slices.
+
+Scoped to issue #5062 under epic #5005. Answers the DoD question:
+
+    Do gaps concentrate in metadata headers, symbolic sequences, comments,
+    or numeric annotations?
+
+The answer comes from the gap-report's per-slice byte-bucket rollup
+(whitespace / punctuation / numbers / words), not from any post-hoc
+analysis here. This file only wires up the models and the slice subset.
+
+Unlike ``exp_model_perplexity_gap_long_tail_runnable``, which sweeps every
+runnable slice, this pilot intentionally narrows to the
+``GAME_MUSIC`` family so the report surfaces symbolic-notation behaviour
+without being dominated by larger SVG / Verilog slices.
+
+PGN and ABC docs are typically an order of magnitude shorter than an
+average Paloma document, so we raise ``max_docs_per_dataset`` well above
+the long-tail-runnable default of 256 to keep the compressed-byte budget
+comparable to a Paloma slice (per dlwh, #5062).
+"""
+
+from fray.v2.types import ResourceConfig
+
+from experiments.evals.long_tail_ppl import LongTailPplFamily
+from experiments.evals.long_tail_ppl_runnable import runnable_long_tail_ppl_slices
+from marin.evaluation.perplexity_gap import (
+    GapFinderModelConfig,
+    RawTextEvaluationDataset,
+    default_model_perplexity_gap,
+)
+from marin.execution.executor import executor_main
+
+RESOURCE_CONFIG = ResourceConfig.with_tpu("v5p-8", regions=["us-central1"])  # reused by all three comparisons
+
+# PGN / ABC docs are much shorter than an average Paloma document, so a higher
+# doc cap keeps the compressed-byte volume per slice roughly Paloma-sized while
+# still being deterministic (HF datasets return rows in a fixed order).
+MAX_DOCS_PER_DATASET = 2048
+MAX_DOC_BYTES = 32_768  # 32 KiB; forwarded as ``max_doc_bytes`` to every comparison below
+
+
+def _game_music_datasets() -> dict[str, RawTextEvaluationDataset]:
+    """Return one raw-text eval dataset per runnable GAME_MUSIC slice, keyed by registry key."""
+    # Only the GAME_MUSIC family is requested, which is what keeps this pilot narrow.
+    family_slices = runnable_long_tail_ppl_slices(family=LongTailPplFamily.GAME_MUSIC)
+    return {entry.registry_key: entry.to_raw_text_dataset() for entry in family_slices}
+
+
+DATASETS = _game_music_datasets()  # evaluated once at import; the same dict is passed to every comparison
+
+MARIN_MODEL = GapFinderModelConfig(  # used as ``model_a`` in all three comparisons
+    checkpoint_path="marin-community/marin-8b-base",
+    checkpoint_is_hf=True,
+    tokenizer="meta-llama/Llama-3.1-8B",  # NOTE(review): assumes Marin-8B shares the Llama-3.1 tokenizer — confirm
+)
+
+_COMMON_TAGS = [  # W&B tags shared by every run; each comparison appends its own model_a / model_b tags
+    "eval=perplexity-gap",
+    "rerun=symbolic-notation-pilot",
+    "issue=5062",
+    "epic=5005",
+    "dataset_bundle=runnable_long_tail_hf_backed",
+    "family=game_music",
+    "source_split=hf_dataset",
+    "region=us-central1",  # mirrors the region pinned in RESOURCE_CONFIG
+    f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
+]
+
+# Baseline pairing: Marin-8B against Llama-3.1-8B. MARIN_MODEL and model_b name
+# the same tokenizer repo, so this comparison keeps the tokenizer constant and
+# varies only the checkpoint.
+MARIN_VS_LLAMA = default_model_perplexity_gap(
+    name="symbolic-notation-pilot-marin-8b-base-vs-llama-3.1-8b-base",
+    datasets=DATASETS,
+    model_a=MARIN_MODEL,
+    model_b=GapFinderModelConfig(
+        tokenizer="meta-llama/Llama-3.1-8B",
+        checkpoint_path="meta-llama/Llama-3.1-8B",
+        checkpoint_is_hf=True,
+    ),
+    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
+    max_doc_bytes=MAX_DOC_BYTES,
+    max_eval_length=4096,
+    per_device_batch_size=4,
+    resource_config=RESOURCE_CONFIG,
+    # Shared tags first, then the two tags that identify this pairing.
+    wandb_tags=_COMMON_TAGS + ["model_a=marin-community/marin-8b-base", "model_b=meta-llama/Llama-3.1-8B"],
+)
+
+# Marin-8B against Qwen3-8B-Base.
+# NOTE(review): the checkpoint comes from ``Qwen/Qwen3-8B-Base`` while the tokenizer
+# is loaded from ``Qwen/Qwen3-8B``; presumably the two repos share a vocabulary — confirm.
+MARIN_VS_QWEN3 = default_model_perplexity_gap(
+    name="symbolic-notation-pilot-marin-8b-base-vs-qwen3-8b-base",
+    datasets=DATASETS,
+    model_a=MARIN_MODEL,
+    model_b=GapFinderModelConfig(
+        tokenizer="Qwen/Qwen3-8B",
+        checkpoint_path="Qwen/Qwen3-8B-Base",
+        checkpoint_is_hf=True,
+    ),
+    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
+    max_doc_bytes=MAX_DOC_BYTES,
+    max_eval_length=4096,
+    per_device_batch_size=4,
+    resource_config=RESOURCE_CONFIG,
+    # Shared tags first, then the two tags that identify this pairing.
+    wandb_tags=_COMMON_TAGS + ["model_a=marin-community/marin-8b-base", "model_b=Qwen/Qwen3-8B-Base"],
+)
+
+# Gemma uses a distinct tokenizer (SentencePiece, 256k vocab) whose whitespace
+# handling differs sharply from Llama-3 / Qwen3, so it shows whether gaps on
+# whitespace-sensitive slices (ABC; kern once it is runnable) track the tokenizer.
+MARIN_VS_GEMMA2 = default_model_perplexity_gap(
+    name="symbolic-notation-pilot-marin-8b-base-vs-gemma-2-9b",
+    datasets=DATASETS,
+    model_a=MARIN_MODEL,
+    model_b=GapFinderModelConfig(
+        tokenizer="google/gemma-2-9b",
+        checkpoint_path="google/gemma-2-9b",
+        checkpoint_is_hf=True,
+    ),
+    max_docs_per_dataset=MAX_DOCS_PER_DATASET,
+    max_doc_bytes=MAX_DOC_BYTES,
+    max_eval_length=4096,
+    per_device_batch_size=4,
+    resource_config=RESOURCE_CONFIG,
+    wandb_tags=_COMMON_TAGS
+    + [
+        "model_a=marin-community/marin-8b-base",
+        "model_b=google/gemma-2-9b",
+    ],
+)
+
+
+if __name__ == "__main__":
+    # One executor invocation submits all three pairings.
+    pilot_steps = [MARIN_VS_LLAMA, MARIN_VS_QWEN3, MARIN_VS_GEMMA2]
+    pilot_description = "Game / music symbolic-notation pilot perplexity-gap report (issue #5062)."
+    executor_main(pilot_steps, description=pilot_description)
diff --git a/tests/evals/test_long_tail_ppl.py b/tests/evals/test_long_tail_ppl.py
@@ -9,6 +9,11 @@
     long_tail_raw_validation_sets,
     render_long_tail_ppl_registry_markdown,
 )
+from experiments.evals.long_tail_ppl_runnable import (
+    RUNNABLE_LONG_TAIL_PPL_REGISTRY,
+    runnable_long_tail_ppl_slices,
+    runnable_long_tail_raw_validation_sets,
+)
 from levanter.data.text import HfDatasetSourceConfig
 from marin.evaluation.perplexity_gap import _to_dataset_component, raw_text_dataset
 from marin.processing.tokenize import HfDatasetSpec
@@ -50,3 +55,44 @@ def test_hf_backed_raw_dataset_preserves_requested_split():
 def test_file_backed_raw_dataset_rejects_non_validation_split():
     with pytest.raises(ValueError, match="Hugging Face dataset sources"):
         raw_text_dataset("gs://example-bucket/eval.jsonl", split="test")
+
+
+def test_runnable_game_music_slices_are_registered():
+    """GAME_MUSIC slices are registered; Lichess and IrishMAN keep their exact HF coordinates."""
+    registered = {entry.name for entry in runnable_long_tail_ppl_slices(family=LongTailPplFamily.GAME_MUSIC)}
+    assert registered.issuperset({"lichess_pgn_2013_06", "irishman_abc", "melodyhub_abc_input"})
+
+    lichess = RUNNABLE_LONG_TAIL_PPL_REGISTRY["long_tail_ppl_runnable/game_music/lichess_pgn_2013_06"]
+    assert lichess.hf_dataset == HfDatasetSpec(id="Icannos/lichess_games", name="2013-06")
+    assert lichess.text_key == "text"
+    # The Lichess mirror only ships a ``train`` split; the slice still uses it,
+    # purely as a diagnostic eval, and the split must also show up in its tags.
+    assert lichess.split == "train"
+    assert "split:train" in lichess.tags
+
+    abc_slice = RUNNABLE_LONG_TAIL_PPL_REGISTRY["long_tail_ppl_runnable/game_music/irishman_abc"]
+    # The IrishMAN column name really does contain a space; pinning the exact
+    # string catches any well-meaning "normalization" of the key.
+    assert abc_slice.text_key == "abc notation"
+    assert abc_slice.split == "validation"
+
+
+def test_runnable_game_music_datasets_round_trip_through_dataset_component():
+    """Lichess and IrishMAN raw datasets convert to HF-backed dataset components intact."""
+    raw_sets = runnable_long_tail_raw_validation_sets()
+    lichess = _to_dataset_component(raw_sets["long_tail_ppl_runnable/game_music/lichess_pgn_2013_06"])
+    irishman = _to_dataset_component(raw_sets["long_tail_ppl_runnable/game_music/irishman_abc"])
+
+    # Lichess: dataset id, pinned config name, ``train`` split, and text column
+    # must all survive the conversion.
+    assert isinstance(lichess.source, HfDatasetSourceConfig)
+    assert lichess.source.id == "Icannos/lichess_games"
+    assert lichess.source.name == "2013-06"
+    assert lichess.source.splits == ["train"]
+    assert lichess.format.text_key == "text"
+
+    # IrishMAN: same checks, minus a config name (none is set for this slice).
+    assert isinstance(irishman.source, HfDatasetSourceConfig)
+    assert irishman.source.id == "sander-wood/irishman"
+    assert irishman.source.splits == ["validation"]
+    assert irishman.format.text_key == "abc notation"