Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions experiments/evals/fineweb2_multilingual.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from experiments.defaults import default_tokenize
from experiments.llama import llama3_tokenizer
from marin.evaluation.perplexity_gap import RawTextEvaluationDataset, raw_text_dataset
from marin.execution.executor import executor_main
from marin.processing.tokenize.data_configs import TokenizerStep

Expand Down Expand Up @@ -190,6 +191,21 @@ def fineweb2_multilingual_eval_bundle(*, tokenizer: str = llama3_tokenizer) -> d
)


def fineweb2_multilingual_raw_validation_sets(
    *,
    configs: Sequence[str] = FINEWEB2_MULTILINGUAL_EVAL_CONFIGS,
    name_prefix: str = "fineweb2_multilingual",
) -> dict[str, RawTextEvaluationDataset]:
    """Return raw FineWeb2 multilingual held-out eval sets for perplexity-gap reports.

    Args:
        configs: FineWeb2 config names to include; one dataset is produced per config.
        name_prefix: Leading component of every dataset name in the result.

    Returns:
        A mapping from ``"<name_prefix>/<config>"`` to the raw-text dataset built from
        that config's ``FINEWEB2_EVAL_SPLIT`` parquet pattern, tagged with
        ``fineweb2_multilingual_tags(config)``.
    """
    # Dataset names are logical "/"-separated keys rather than local filesystem paths,
    # so join them with posixpath: identical to os.path.join on POSIX hosts, and it
    # keeps the keys stable ("prefix/config") on platforms with a different separator.
    import posixpath

    return {
        posixpath.join(name_prefix, config): raw_text_dataset(
            fineweb2_multilingual_parquet_pattern(config, FINEWEB2_EVAL_SPLIT),
            tags=tuple(fineweb2_multilingual_tags(config)),
        )
        for config in configs
    }


if __name__ == "__main__":
executor_main(
steps=list(fineweb2_multilingual_eval_bundle().values()),
Expand Down
81 changes: 81 additions & 0 deletions experiments/exp_model_perplexity_gap_fineweb2_multilingual.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

from fray.v2.types import ResourceConfig

from experiments.defaults import default_raw_validation_sets
from experiments.evals.fineweb2_multilingual import fineweb2_multilingual_raw_validation_sets
from marin.evaluation.perplexity_gap import GapFinderModelConfig, default_model_perplexity_gap
from marin.execution.executor import executor_main

# Accelerator request shared by every perplexity-gap step defined in this module.
RESOURCE_CONFIG = ResourceConfig.with_tpu("v5p-8", regions=["us-central1"])

# Per-dataset caps passed to default_model_perplexity_gap below.
MAX_DOCS_PER_DATASET = 256
MAX_DOC_BYTES = 32 * 1024

# Default raw validation sets extended with the FineWeb2 multilingual held-out sets.
# The FineWeb2 entries are applied last, so they win if a dataset name collides.
DATASETS = dict(default_raw_validation_sets())
DATASETS.update(fineweb2_multilingual_raw_validation_sets())

# Model A for every comparison in this module.
MARIN_MODEL = GapFinderModelConfig(
    tokenizer="meta-llama/Llama-3.1-8B",
    checkpoint_path="marin-community/marin-8b-base",
    checkpoint_is_hf=True,
)

def _marin_gap_step(*, name: str, checkpoint_path: str, tokenizer: str):
    """Build one perplexity-gap step comparing MARIN_MODEL (model A) against model B.

    Args:
        name: Step name passed through to ``default_model_perplexity_gap``.
        checkpoint_path: Checkpoint of model B; also recorded in the ``model_b=`` tag.
        tokenizer: Tokenizer used for model B.

    Returns:
        Whatever ``default_model_perplexity_gap`` returns for this configuration.
    """
    return default_model_perplexity_gap(
        name=name,
        model_a=MARIN_MODEL,
        model_b=GapFinderModelConfig(
            checkpoint_path=checkpoint_path,
            checkpoint_is_hf=True,
            tokenizer=tokenizer,
        ),
        datasets=DATASETS,
        resource_config=RESOURCE_CONFIG,
        per_device_batch_size=4,
        max_eval_length=4096,
        max_docs_per_dataset=MAX_DOCS_PER_DATASET,
        max_doc_bytes=MAX_DOC_BYTES,
        wandb_tags=[
            "eval=perplexity-gap",
            "rerun=fineweb2-multilingual",
            "model_a=marin-community/marin-8b-base",
            f"model_b={checkpoint_path}",
            "dataset_bundle=default_raw_plus_fineweb2_multilingual",
            "region=us-central1",
            f"max_docs_per_dataset={MAX_DOCS_PER_DATASET}",
        ],
    )


# Marin 8B base vs. Llama 3.1 8B base.
MARIN_VS_LLAMA = _marin_gap_step(
    name="fineweb2-multilingual-marin-8b-base-vs-llama-3.1-8b-base-doccap256",
    checkpoint_path="meta-llama/Llama-3.1-8B",
    tokenizer="meta-llama/Llama-3.1-8B",
)

# Marin 8B base vs. Qwen3 8B base (the tokenizer is loaded from the "Qwen/Qwen3-8B" repo).
MARIN_VS_QWEN3 = _marin_gap_step(
    name="fineweb2-multilingual-marin-8b-base-vs-qwen3-8b-base-doccap256",
    checkpoint_path="Qwen/Qwen3-8B-Base",
    tokenizer="Qwen/Qwen3-8B",
)


if __name__ == "__main__":
    # Submit both comparisons through a single executor invocation.
    gap_report_steps = [MARIN_VS_LLAMA, MARIN_VS_QWEN3]
    executor_main(
        gap_report_steps,
        description="Run Marin perplexity-gap reports with FineWeb2 multilingual held-out eval sets.",
    )
19 changes: 19 additions & 0 deletions tests/evals/test_fineweb2_multilingual.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

from experiments.evals.fineweb2_multilingual import fineweb2_multilingual_raw_validation_sets


def test_fineweb2_multilingual_raw_sets_use_eval_split_in_path_only():
    """The eval split shows up in the input path, while the tags carry no split marker."""
    expected_tags = (
        "fineweb2_multilingual",
        "fineweb2_multilingual/script/Latn",
        "fineweb2_multilingual/language/deu",
        "fineweb2_multilingual/top_50_by_rows",
    )

    raw_sets = fineweb2_multilingual_raw_validation_sets(configs=("deu_Latn",))
    german = raw_sets["fineweb2_multilingual/deu_Latn"]

    # The input path is a plain glob string that points at the held-out "test" split.
    assert isinstance(german.input_path, str)
    assert "/deu_Latn/test/*.parquet" in german.input_path
    assert german.tags == expected_tags
Loading