diff --git a/experiments/pretraining_datasets/climblab_ja.py b/experiments/pretraining_datasets/climblab_ja.py
new file mode 100644
index 0000000000..9872a028d0
--- /dev/null
+++ b/experiments/pretraining_datasets/climblab_ja.py
@@ -0,0 +1,48 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""ClimbLab-Ja pre-training dataset tokenization.
+
+Download/normalize definitions live in marin.datakit.download.climblab_ja.
+This file wires the normalized output into a tokenize step for experiment
+pipelines.
+"""
+
+import os.path
+
+from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
+from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
+from marin.processing.tokenize import TokenizeConfig, tokenize
+from marin.processing.tokenize.data_configs import TokenizerStep
+
+from experiments.marin_models import marin_tokenizer
+
+# Datakit step specs, converted below into executor steps for this pipeline.
+_download_spec, _normalize_spec = climblab_ja_normalize_steps()
+
+download: ExecutorStep = _download_spec.as_executor_step()
+normalized: ExecutorStep = _normalize_spec.as_executor_step()
+
+
+def tokenize_climblab_ja(*, tokenizer: str = marin_tokenizer) -> TokenizerStep:
+    """Build the executor step that tokenizes the normalized ClimbLab-Ja data.
+
+    Training input is the ``outputs/main/*.parquet`` glob under the
+    ``normalized`` step; the validation path list is empty and
+    ``cache_path`` is set to ``this_output_path()``.
+
+    Args:
+        tokenizer: Tokenizer identifier, wrapped in ``versioned`` in the config.
+    """
+    step_name = os.path.join("tokenized", "climblab-ja")
+    tokenize_config = TokenizeConfig(
+        train_paths=[normalized / "outputs/main/*.parquet"],
+        validation_paths=versioned([]),
+        cache_path=this_output_path(),
+        tokenizer=versioned(tokenizer),
+    )
+    return ExecutorStep(name=step_name, fn=tokenize, config=tokenize_config)
+
+
+if __name__ == "__main__":
+    executor_main(steps=[tokenize_climblab_ja()])
diff --git a/lib/marin/src/marin/datakit/download/climblab_ja.py b/lib/marin/src/marin/datakit/download/climblab_ja.py
new file mode 100644
index 0000000000..0241f83553
--- /dev/null
+++ b/lib/marin/src/marin/datakit/download/climblab_ja.py
@@ -0,0 +1,34 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""KantaHayashiAI/ClimbLab-Ja dataset download + normalize helpers.
+
+~300B Japanese tokens (480 GB parquet, 201M rows) derived from LLM-jp
+Corpus v4 with semantic re-clustering — a Japanese adaptation of
+NVIDIA's Nemotron-ClimbLab quality-filtering pipeline. Per-document
+fields ``quality``, ``advertisement``, and the four ``*_value`` scores
+are preserved through normalize so downstream consumers can re-filter
+without re-deriving them. License: ODC-BY.
+"""
+
+from marin.datakit.download.huggingface import download_hf_step
+from marin.datakit.normalize import normalize_step
+from marin.execution.step_spec import StepSpec
+
+HF_DATASET_ID = "KantaHayashiAI/ClimbLab-Ja"
+HF_REVISION = "889e349"
+
+
+def climblab_ja_normalize_steps() -> tuple[StepSpec, ...]:
+    """Build the ClimbLab-Ja ``(download, normalize)`` step pair.
+
+    The normalize step takes the revision-pinned Hugging Face download as
+    its input and is configured with ``file_extensions=(".parquet",)``.
+    """
+    raw = download_hf_step("raw/climblab-ja", hf_dataset_id=HF_DATASET_ID, revision=HF_REVISION)
+    normalized = normalize_step(
+        name="normalized/climblab-ja",
+        download=raw,
+        file_extensions=(".parquet",),
+    )
+    return raw, normalized
diff --git a/lib/marin/src/marin/datakit/sources.py b/lib/marin/src/marin/datakit/sources.py
index 7527a9d3ae..6bf1feaad4 100644
--- a/lib/marin/src/marin/datakit/sources.py
+++ b/lib/marin/src/marin/datakit/sources.py
@@ -19,6 +19,7 @@
 
 from marin.datakit.canonical.safety_pretraining import safety_pretraining_normalize_steps
 from marin.datakit.download.biodiversity import biodiversity_normalize_steps
+from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
 from marin.datakit.download.coderforge import coderforge_normalize_steps
 from marin.datakit.download.common_pile import common_pile_normalize_steps
 from marin.datakit.download.davinci_dev import (
@@ -145,6 +146,7 @@ def all_sources() -> dict[str, DatakitSource]:
         # cp/biodiversity is carved out of common_pile (see common_pile.py)
         # because it needs page-stitching before normalize.
("cp/biodiversity", biodiversity_normalize_steps, 8.60), + ("climblab-ja", climblab_ja_normalize_steps, 371.92), ("coderforge", coderforge_normalize_steps, 10.29), ("davinci-dev/ctx-native", davinci_dev_ctx_native_normalize_steps, 57.57), ("davinci-dev/env-native", davinci_dev_env_native_normalize_steps, 2.58),