Skip to content

Commit c888751

Browse files
[evals] Add game / music runnable PPL slices (issue #5062)
Adds HF-backed runnable slices for the GAME_MUSIC family on top of the long-tail PPL runnable registry introduced in #5075: - lichess_pgn_2013_06: Icannos/lichess_games @ config 2013-06, text column carries full PGN (headers, movetext, NAGs, comments, result); CC0. - irishman_abc: sander-wood/irishman validation split, column is the literal "abc notation" string; MIT / public domain. - melodyhub_abc_input: sander-wood/melodyhub validation split, ABC body with task-tag prefixes so the gap report can separate plain-ABC PPL from ABC-with-structural-prefix PPL. Also adds a focused pilot gap-report experiment (exp_model_perplexity_gap_symbolic_notation_pilot.py) running Marin-8B against Llama-3.1-8B, Qwen3-8B-Base, and Gemma-2-9b on just the game/music slices, with max_docs_per_dataset=2048 so each slice's compressed-byte volume stays roughly Paloma-sized (PGN/ABC docs are much shorter than an average Paloma doc). Humdrum/Kern has no clean HF mirror and remains as a stub in the long_tail_ppl.py registry for a later download-step mirror. Follow-up tokenizer-axis comparison filed as #5079. Co-authored-by: David Hall <dlwh@users.noreply.github.com>
1 parent 9c8fcad commit c888751

3 files changed

Lines changed: 233 additions & 0 deletions

File tree

experiments/evals/long_tail_ppl_runnable.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,49 @@ def to_raw_text_dataset(self) -> RawTextEvaluationDataset:
7979
split="test",
8080
notes="Keep VerilogEval reference implementations and formatting intact.",
8181
),
82+
# Game / music (issue #5062)
83+
#
84+
# Lichess games are mirrored on HF with one full PGN per row (header tags,
85+
# movetext, NAGs, comments, variations, result marker). We pin a single early
86+
# month as the config so the slice is deterministic and small; the only split
87+
# this corpus ships is ``train``, which we treat as a diagnostic eval split
88+
# (we never train on it). License is CC0 upstream.
89+
RunnableLongTailPplSlice(
90+
name="lichess_pgn_2013_06",
91+
family=LongTailPplFamily.GAME_MUSIC,
92+
source_url="https://huggingface.co/datasets/Icannos/lichess_games",
93+
hf_dataset=HfDatasetSpec(id="Icannos/lichess_games", name="2013-06"),
94+
text_key="text",
95+
split="train",
96+
notes="Preserve PGN header tags, movetext, NAGs, comments, and result markers verbatim.",
97+
),
98+
# IrishMAN ships a dedicated ``validation`` split of ABC tunes; the column
99+
# name is literally ``abc notation`` (with a space). License is MIT / public
100+
# domain. We keep headers (``X:``/``T:``/``M:``/``L:``/``K:``/``Q:``), bar
101+
# lines, inline ``%`` comments, repeats, chord symbols, and decorations
102+
# verbatim so the gap-report byte buckets can attribute tokens to each.
103+
RunnableLongTailPplSlice(
104+
name="irishman_abc",
105+
family=LongTailPplFamily.GAME_MUSIC,
106+
source_url="https://huggingface.co/datasets/sander-wood/irishman",
107+
hf_dataset=HfDatasetSpec(id="sander-wood/irishman"),
108+
text_key="abc notation",
109+
split="validation",
110+
notes="Preserve ABC headers, bar lines, repeats, chord symbols, and decorations verbatim.",
111+
),
112+
# MelodyHub exposes the same ABC surface form but with a task-prefixed
113+
# ``input`` column, giving a different structural flavour (task tag + ABC
114+
# body) than IrishMAN. Useful as a second ABC slice to separate structural
115+
# PPL from plain-ABC PPL.
116+
RunnableLongTailPplSlice(
117+
name="melodyhub_abc_input",
118+
family=LongTailPplFamily.GAME_MUSIC,
119+
source_url="https://huggingface.co/datasets/sander-wood/melodyhub",
120+
hf_dataset=HfDatasetSpec(id="sander-wood/melodyhub"),
121+
text_key="input",
122+
split="validation",
123+
notes="ABC with task-tag prefixes; keep task markers and musical body together.",
124+
),
82125
)
83126

84127
RUNNABLE_LONG_TAIL_PPL_REGISTRY: dict[str, RunnableLongTailPplSlice] = {
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Pilot perplexity-gap report for the game / music symbolic-notation slices.
5+
6+
Scoped to issue #5062 under epic #5005. Answers the DoD question:
7+
8+
Do gaps concentrate in metadata headers, symbolic sequences, comments,
9+
or numeric annotations?
10+
11+
The answer comes from the gap-report's per-slice byte-bucket rollup
12+
(whitespace / punctuation / numbers / words), not from any post-hoc
13+
analysis here. This file only wires up the models and the slice subset.
14+
15+
Unlike ``exp_model_perplexity_gap_long_tail_runnable`` which sweeps every
16+
runnable slice, this pilot intentionally narrows to the
17+
``GAME_MUSIC`` family so the report surfaces symbolic-notation behaviour
18+
without being dominated by larger SVG / Verilog slices.
19+
20+
PGN and ABC docs are typically an order of magnitude shorter than an
21+
average Paloma document, so we raise ``max_docs_per_dataset`` well above
22+
the long-tail-runnable default of 256 to keep the compressed-byte budget
23+
comparable to a Paloma slice (per dlwh, #5062).
24+
"""
25+
26+
from fray.v2.types import ResourceConfig
27+
28+
from experiments.evals.long_tail_ppl import LongTailPplFamily
29+
from experiments.evals.long_tail_ppl_runnable import runnable_long_tail_ppl_slices
30+
from marin.evaluation.perplexity_gap import (
31+
GapFinderModelConfig,
32+
RawTextEvaluationDataset,
33+
default_model_perplexity_gap,
34+
)
35+
from marin.execution.executor import executor_main
36+
37+
RESOURCE_CONFIG = ResourceConfig.with_tpu("v5p-8", regions=["us-central1"])
38+
39+
# PGN / ABC docs are much shorter than an average Paloma document, so a higher
40+
# doc cap keeps the compressed-byte volume per slice roughly Paloma-sized while
41+
# still being deterministic (HF datasets return rows in a fixed order).
42+
MAX_DOCS_PER_DATASET = 2048
43+
MAX_DOC_BYTES = 32_768
44+
45+
46+
def _game_music_datasets() -> dict[str, RawTextEvaluationDataset]:
    """Collect the raw-text eval datasets for the runnable GAME_MUSIC slices.

    Returns:
        A mapping from each slice's ``registry_key`` to the result of that
        slice's ``to_raw_text_dataset()``, in the order the slices are yielded
        by ``runnable_long_tail_ppl_slices``.
    """
    datasets: dict[str, RawTextEvaluationDataset] = {}
    for entry in runnable_long_tail_ppl_slices(family=LongTailPplFamily.GAME_MUSIC):
        # Read the key before building the dataset, matching the key-then-value
        # evaluation order of a dict comprehension.
        key = entry.registry_key
        datasets[key] = entry.to_raw_text_dataset()
    return datasets
51+
52+
53+
DATASETS = _game_music_datasets()
54+
55+
MARIN_MODEL = GapFinderModelConfig(
56+
checkpoint_path="marin-community/marin-8b-base",
57+
checkpoint_is_hf=True,
58+
tokenizer="meta-llama/Llama-3.1-8B",
59+
)
60+
61+
_COMMON_TAGS = [
62+
"eval=perplexity-gap",
63+
"rerun=symbolic-notation-pilot",
64+
"issue=5062",
65+
"epic=5005",
66+
"dataset_bundle=runnable_long_tail_hf_backed",
67+
"family=game_music",
68+
"source_split=hf_dataset",
69+
"region=us-central1",
70+
f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
71+
]
72+
73+
def _pilot_gap_step(name: str, checkpoint: str, tokenizer: str):
    """Build one Marin-8B-vs-baseline perplexity-gap step for this pilot.

    Every comparison in the pilot shares the same datasets, resource config,
    batch size, eval length, doc caps, and common W&B tags; only the baseline
    model differs. Keeping the shared settings in one place prevents the
    comparisons from drifting apart when a knob is changed.

    Args:
        name: Step name forwarded to ``default_model_perplexity_gap``.
        checkpoint: HF checkpoint id of the baseline (``model_b``). It is also
            recorded verbatim in the ``model_b=`` W&B tag.
        tokenizer: Tokenizer id used for the baseline model.

    Returns:
        The value returned by ``default_model_perplexity_gap`` for this
        comparison.
    """
    return default_model_perplexity_gap(
        name=name,
        model_a=MARIN_MODEL,
        model_b=GapFinderModelConfig(
            checkpoint_path=checkpoint,
            checkpoint_is_hf=True,
            tokenizer=tokenizer,
        ),
        datasets=DATASETS,
        resource_config=RESOURCE_CONFIG,
        per_device_batch_size=4,
        max_eval_length=4096,
        max_docs_per_dataset=MAX_DOCS_PER_DATASET,
        max_doc_bytes=MAX_DOC_BYTES,
        wandb_tags=[
            *_COMMON_TAGS,
            "model_a=marin-community/marin-8b-base",
            f"model_b={checkpoint}",
        ],
    )


MARIN_VS_LLAMA = _pilot_gap_step(
    name="symbolic-notation-pilot-marin-8b-base-vs-llama-3.1-8b-base",
    checkpoint="meta-llama/Llama-3.1-8B",
    tokenizer="meta-llama/Llama-3.1-8B",
)

# The Qwen3 base checkpoint is evaluated with the ``Qwen/Qwen3-8B`` tokenizer id.
MARIN_VS_QWEN3 = _pilot_gap_step(
    name="symbolic-notation-pilot-marin-8b-base-vs-qwen3-8b-base",
    checkpoint="Qwen/Qwen3-8B-Base",
    tokenizer="Qwen/Qwen3-8B",
)

# Gemma uses a distinct tokenizer (SentencePiece, 256k vocab) with very different
# whitespace handling from Llama-3 / Qwen3. Useful for seeing whether apparent
# gaps on whitespace-sensitive slices (ABC today; kern once it has a runnable
# slice) track with tokenizer choice.
MARIN_VS_GEMMA2 = _pilot_gap_step(
    name="symbolic-notation-pilot-marin-8b-base-vs-gemma-2-9b",
    checkpoint="google/gemma-2-9b",
    tokenizer="google/gemma-2-9b",
)
138+
139+
140+
if __name__ == "__main__":
    # Run all three pilot comparisons in a single executor invocation.
    pilot_steps = [MARIN_VS_LLAMA, MARIN_VS_QWEN3, MARIN_VS_GEMMA2]
    executor_main(
        pilot_steps,
        description="Game / music symbolic-notation pilot perplexity-gap report (issue #5062).",
    )

tests/evals/test_long_tail_ppl.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99
long_tail_raw_validation_sets,
1010
render_long_tail_ppl_registry_markdown,
1111
)
12+
from experiments.evals.long_tail_ppl_runnable import (
13+
RUNNABLE_LONG_TAIL_PPL_REGISTRY,
14+
runnable_long_tail_ppl_slices,
15+
runnable_long_tail_raw_validation_sets,
16+
)
1217
from levanter.data.text import HfDatasetSourceConfig
1318
from marin.evaluation.perplexity_gap import _to_dataset_component, raw_text_dataset
1419
from marin.processing.tokenize import HfDatasetSpec
@@ -50,3 +55,44 @@ def test_hf_backed_raw_dataset_preserves_requested_split():
5055
def test_file_backed_raw_dataset_rejects_non_validation_split():
5156
with pytest.raises(ValueError, match="Hugging Face dataset sources"):
5257
raw_text_dataset("gs://example-bucket/eval.jsonl", split="test")
58+
59+
60+
def test_runnable_game_music_slices_are_registered():
    """The game / music slices are registered with the expected HF coordinates."""
    expected_names = {"lichess_pgn_2013_06", "irishman_abc", "melodyhub_abc_input"}
    registered_names = {
        entry.name for entry in runnable_long_tail_ppl_slices(family=LongTailPplFamily.GAME_MUSIC)
    }
    assert expected_names.issubset(registered_names)

    pgn_slice = RUNNABLE_LONG_TAIL_PPL_REGISTRY["long_tail_ppl_runnable/game_music/lichess_pgn_2013_06"]
    expected_pgn_spec = HfDatasetSpec(id="Icannos/lichess_games", name="2013-06")
    assert pgn_slice.hf_dataset == expected_pgn_spec
    assert pgn_slice.text_key == "text"
    # PGN only ships a ``train`` split; we still use it as a diagnostic eval.
    assert pgn_slice.split == "train"
    assert "split:train" in pgn_slice.tags

    irishman_slice = RUNNABLE_LONG_TAIL_PPL_REGISTRY["long_tail_ppl_runnable/game_music/irishman_abc"]
    # The IrishMAN column name contains a space (``abc notation``). Pinning the
    # exact string catches drift if someone "normalizes" it.
    assert irishman_slice.text_key == "abc notation"
    assert irishman_slice.split == "validation"
78+
79+
80+
def test_runnable_game_music_datasets_round_trip_through_dataset_component():
    """PGN and IrishMAN raw datasets convert into HF-sourced dataset components."""
    raw_sets = runnable_long_tail_raw_validation_sets()

    # Convert both datasets up front so a conversion failure surfaces before
    # any field-level assertion runs.
    pgn_component = _to_dataset_component(
        raw_sets["long_tail_ppl_runnable/game_music/lichess_pgn_2013_06"]
    )
    irishman_component = _to_dataset_component(
        raw_sets["long_tail_ppl_runnable/game_music/irishman_abc"]
    )

    pgn_source = pgn_component.source
    assert isinstance(pgn_source, HfDatasetSourceConfig)
    assert pgn_source.id == "Icannos/lichess_games"
    assert pgn_source.name == "2013-06"
    assert pgn_source.splits == ["train"]
    assert pgn_component.format.text_key == "text"

    irishman_source = irishman_component.source
    assert isinstance(irishman_source, HfDatasetSourceConfig)
    assert irishman_source.id == "sander-wood/irishman"
    assert irishman_source.splits == ["validation"]
    assert irishman_component.format.text_key == "abc notation"

0 commit comments

Comments
 (0)