Skip to content

Commit 15a2585

Browse files
committed
Add FineWeb2 multilingual gap rerun entrypoint
1 parent eacfcf2 commit 15a2585

3 files changed

Lines changed: 117 additions & 0 deletions

File tree

experiments/evals/fineweb2_multilingual.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from experiments.defaults import default_tokenize
1616
from experiments.llama import llama3_tokenizer
17+
from marin.evaluation.perplexity_gap import RawTextEvaluationDataset, raw_text_dataset
1718
from marin.execution.executor import executor_main
1819
from marin.processing.tokenize.data_configs import TokenizerStep
1920

@@ -190,6 +191,21 @@ def fineweb2_multilingual_eval_bundle(*, tokenizer: str = llama3_tokenizer) -> d
190191
)
191192

192193

194+
def fineweb2_multilingual_raw_validation_sets(
    *,
    configs: Sequence[str] = FINEWEB2_MULTILINGUAL_EVAL_CONFIGS,
    name_prefix: str = "fineweb2_multilingual",
) -> dict[str, RawTextEvaluationDataset]:
    """Build the raw FineWeb2 multilingual held-out eval sets for perplexity-gap reports.

    Args:
        configs: FineWeb2 config names to include; one dataset is produced per entry.
        name_prefix: Leading component of every dataset key.

    Returns:
        A mapping from ``name_prefix`` joined with the config name to the raw-text
        dataset built from that config's ``FINEWEB2_EVAL_SPLIT`` parquet pattern
        and tagged with that config's tags.
    """
    raw_sets: dict[str, RawTextEvaluationDataset] = {}
    for config_name in configs:
        dataset_key = os.path.join(name_prefix, config_name)
        parquet_pattern = fineweb2_multilingual_parquet_pattern(config_name, FINEWEB2_EVAL_SPLIT)
        # The tags are materialized as a tuple before being handed to the dataset.
        dataset_tags = tuple(fineweb2_multilingual_tags(config_name))
        raw_sets[dataset_key] = raw_text_dataset(parquet_pattern, tags=dataset_tags)
    return raw_sets
207+
208+
193209
if __name__ == "__main__":
194210
executor_main(
195211
steps=list(fineweb2_multilingual_eval_bundle().values()),
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from fray.v2.types import ResourceConfig
5+
6+
from experiments.defaults import default_raw_validation_sets
7+
from experiments.evals.fineweb2_multilingual import fineweb2_multilingual_raw_validation_sets
8+
from marin.evaluation.perplexity_gap import GapFinderModelConfig, default_model_perplexity_gap
9+
from marin.execution.executor import executor_main
10+
11+
# TPU request shared by every perplexity-gap report defined in this file.
RESOURCE_CONFIG = ResourceConfig.with_tpu("v5p-8", regions=["us-central1"])

# Caps passed to the reports as max_docs_per_dataset / max_doc_bytes.
MAX_DOCS_PER_DATASET = 256
MAX_DOC_BYTES = 32 * 1024

# Default raw validation sets first, then the FineWeb2 multilingual sets; if a
# key appears in both, the FineWeb2 entry wins because it is merged last.
DATASETS = dict(default_raw_validation_sets())
DATASETS.update(fineweb2_multilingual_raw_validation_sets())
19+
20+
# Marin 8B base checkpoint (flagged as a Hugging Face checkpoint), paired with
# the Llama 3.1 8B tokenizer. It is model_a in every report below.
MARIN_MODEL = GapFinderModelConfig(
    checkpoint_path="marin-community/marin-8b-base",
    checkpoint_is_hf=True,
    tokenizer="meta-llama/Llama-3.1-8B",
)


def _marin_gap_report(*, name: str, model_b_checkpoint: str, model_b_tokenizer: str):
    """Build one perplexity-gap report with MARIN_MODEL as model_a.

    Args:
        name: Report name passed through to ``default_model_perplexity_gap``.
        model_b_checkpoint: Checkpoint path of the comparison model; it is also
            recorded verbatim in the ``model_b=...`` W&B tag.
        model_b_tokenizer: Tokenizer identifier for the comparison model.

    Returns:
        Whatever ``default_model_perplexity_gap`` returns for this configuration.
    """
    comparison_model = GapFinderModelConfig(
        checkpoint_path=model_b_checkpoint,
        checkpoint_is_hf=True,
        tokenizer=model_b_tokenizer,
    )
    # A fresh list is built on every call so the reports never share tag state.
    tags = [
        "eval=perplexity-gap",
        "rerun=fineweb2-multilingual",
        "model_a=marin-community/marin-8b-base",
        f"model_b={model_b_checkpoint}",
        "dataset_bundle=default_raw_plus_fineweb2_multilingual",
        "region=us-central1",
        f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
    ]
    return default_model_perplexity_gap(
        name=name,
        model_a=MARIN_MODEL,
        model_b=comparison_model,
        datasets=DATASETS,
        resource_config=RESOURCE_CONFIG,
        per_device_batch_size=4,
        max_eval_length=4096,
        max_docs_per_dataset=MAX_DOCS_PER_DATASET,
        max_doc_bytes=MAX_DOC_BYTES,
        wandb_tags=tags,
    )


MARIN_VS_LLAMA = _marin_gap_report(
    name="fineweb2-multilingual-marin-8b-base-vs-llama-3.1-8b-base-doccap256",
    model_b_checkpoint="meta-llama/Llama-3.1-8B",
    model_b_tokenizer="meta-llama/Llama-3.1-8B",
)

# Note that the tokenizer id ("Qwen/Qwen3-8B") differs from the checkpoint id
# ("Qwen/Qwen3-8B-Base"); both are kept exactly as configured.
MARIN_VS_QWEN3 = _marin_gap_report(
    name="fineweb2-multilingual-marin-8b-base-vs-qwen3-8b-base-doccap256",
    model_b_checkpoint="Qwen/Qwen3-8B-Base",
    model_b_tokenizer="Qwen/Qwen3-8B",
)
75+
76+
77+
if __name__ == "__main__":
    # Both reports are handed to the executor together, Llama comparison first.
    gap_report_steps = [MARIN_VS_LLAMA, MARIN_VS_QWEN3]
    executor_main(
        gap_report_steps,
        description="Run Marin perplexity-gap reports with FineWeb2 multilingual held-out eval sets.",
    )
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from experiments.evals.fineweb2_multilingual import fineweb2_multilingual_raw_validation_sets
5+
6+
7+
def test_fineweb2_multilingual_raw_sets_use_eval_split_in_path_only():
    """The "test" split appears only in the parquet path; the dataset's own split is "validation"."""
    expected_tags = (
        "fineweb2_multilingual",
        "fineweb2_multilingual/script/Latn",
        "fineweb2_multilingual/language/deu",
        "fineweb2_multilingual/top_50_by_rows",
    )

    raw_sets = fineweb2_multilingual_raw_validation_sets(configs=("deu_Latn",))
    german = raw_sets["fineweb2_multilingual/deu_Latn"]
    parquet_path = german.input_path

    assert isinstance(parquet_path, str)
    assert "/deu_Latn/test/*.parquet" in parquet_path
    assert german.split == "validation"
    assert german.tags == expected_tags

0 commit comments

Comments
 (0)