From 59de36298b53176f1e2325e6553866e3edabc7c5 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Thu, 14 May 2026 17:39:31 +0000 Subject: [PATCH 1/3] Add KantaHayashiAI/ClimbLab-Ja as a Datakit source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Registers climblab-ja (~300B Japanese tokens, 201M rows / 480 GB parquet) as a single-source datakit entry. Derived from LLM-jp Corpus v4 with Nemotron-ClimbLab–style semantic clustering and per-document quality / value scores; those columns pass through normalize so downstream consumers can re-filter without re-deriving them. License: ODC-BY. --- .../src/marin/datakit/download/climblab_ja.py | 34 +++++++++++++++++++ lib/marin/src/marin/datakit/sources.py | 2 ++ 2 files changed, 36 insertions(+) create mode 100644 lib/marin/src/marin/datakit/download/climblab_ja.py diff --git a/lib/marin/src/marin/datakit/download/climblab_ja.py b/lib/marin/src/marin/datakit/download/climblab_ja.py new file mode 100644 index 0000000000..0241f83553 --- /dev/null +++ b/lib/marin/src/marin/datakit/download/climblab_ja.py @@ -0,0 +1,34 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""KantaHayashiAI/ClimbLab-Ja dataset download + normalize helpers. + +~300B Japanese tokens (480 GB parquet, 201M rows) derived from LLM-jp +Corpus v4 with semantic re-clustering — a Japanese adaptation of +NVIDIA's Nemotron-ClimbLab quality-filtering pipeline. Per-document +fields ``quality``, ``advertisement``, and the four ``*_value`` scores +are preserved through normalize so downstream consumers can re-filter +without re-deriving them. License: ODC-BY. +""" + +from marin.datakit.download.huggingface import download_hf_step +from marin.datakit.normalize import normalize_step +from marin.execution.step_spec import StepSpec + +HF_DATASET_ID = "KantaHayashiAI/ClimbLab-Ja" +HF_REVISION = "889e349" + + +def climblab_ja_normalize_steps() -> tuple[StepSpec, ...]: + """Return the ``(download, normalize)`` chain for ClimbLab-Ja.""" + download = download_hf_step( + "raw/climblab-ja", + hf_dataset_id=HF_DATASET_ID, + revision=HF_REVISION, + ) + normalize = normalize_step( + name="normalized/climblab-ja", + download=download, + file_extensions=(".parquet",), + ) + return (download, normalize) diff --git a/lib/marin/src/marin/datakit/sources.py b/lib/marin/src/marin/datakit/sources.py index 9741542bdc..15ed99be7f 100644 --- a/lib/marin/src/marin/datakit/sources.py +++ b/lib/marin/src/marin/datakit/sources.py @@ -18,6 +18,7 @@ from functools import cache from marin.datakit.download.biodiversity import biodiversity_normalize_steps +from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps from marin.datakit.download.coderforge import coderforge_normalize_steps from marin.datakit.download.common_pile import common_pile_normalize_steps from marin.datakit.download.davinci_dev import ( @@ -144,6 +145,7 @@ def all_sources() -> dict[str, DatakitSource]: # cp/biodiversity is carved out of common_pile (see common_pile.py) # because it needs page-stitching before normalize. ("cp/biodiversity", biodiversity_normalize_steps, 8.60), + ("climblab-ja", climblab_ja_normalize_steps, 300.0), ("coderforge", coderforge_normalize_steps, 10.29), ("davinci-dev/ctx-native", davinci_dev_ctx_native_normalize_steps, 57.57), ("davinci-dev/env-native", davinci_dev_env_native_normalize_steps, 2.58), From c875434c92b7f6bb0c04218ee51dbfbb48b204f0 Mon Sep 17 00:00:00 2001 From: Helw150 Date: Fri, 15 May 2026 12:39:17 -0700 Subject: [PATCH 2/3] Add ClimbLab-Ja tokenize step and update token count to 371.92B --- .../pretraining_datasets/climblab_ja.py | 39 +++++++++++++++++++ lib/marin/src/marin/datakit/sources.py | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 experiments/pretraining_datasets/climblab_ja.py diff --git a/experiments/pretraining_datasets/climblab_ja.py b/experiments/pretraining_datasets/climblab_ja.py new file mode 100644 index 0000000000..76840ff731 --- /dev/null +++ b/experiments/pretraining_datasets/climblab_ja.py @@ -0,0 +1,39 @@ +# Copyright The Marin Authors +# SPDX-License-Identifier: Apache-2.0 + +"""ClimbLab-Ja pre-training dataset tokenization. + +Download/normalize definitions live in marin.datakit.download.climblab_ja. +This file wires the normalized output into a tokenize step for experiment +pipelines. +""" + +import os.path + +from experiments.marin_models import marin_tokenizer +from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps +from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned +from marin.processing.tokenize import TokenizeConfig, tokenize +from marin.processing.tokenize.data_configs import TokenizerStep + +_download_spec, _normalize_spec = climblab_ja_normalize_steps() + +download: ExecutorStep = _download_spec.as_executor_step() +normalized: ExecutorStep = _normalize_spec.as_executor_step() + + +def tokenize_climblab_ja(*, tokenizer: str = marin_tokenizer) -> TokenizerStep: + return ExecutorStep( + name=os.path.join("tokenized", "climblab-ja"), + fn=tokenize, + config=TokenizeConfig( + train_paths=[normalized / "outputs/main/*.parquet"], + validation_paths=versioned([]), + cache_path=this_output_path(), + tokenizer=versioned(tokenizer), + ), + ) + + +if __name__ == "__main__": + executor_main(steps=[tokenize_climblab_ja()]) diff --git a/lib/marin/src/marin/datakit/sources.py b/lib/marin/src/marin/datakit/sources.py index 15ed99be7f..e121b5844c 100644 --- a/lib/marin/src/marin/datakit/sources.py +++ b/lib/marin/src/marin/datakit/sources.py @@ -145,7 +145,7 @@ def all_sources() -> dict[str, DatakitSource]: # cp/biodiversity is carved out of common_pile (see common_pile.py) # because it needs page-stitching before normalize. ("cp/biodiversity", biodiversity_normalize_steps, 8.60), - ("climblab-ja", climblab_ja_normalize_steps, 300.0), + ("climblab-ja", climblab_ja_normalize_steps, 371.92), ("coderforge", coderforge_normalize_steps, 10.29), ("davinci-dev/ctx-native", davinci_dev_ctx_native_normalize_steps, 57.57), ("davinci-dev/env-native", davinci_dev_env_native_normalize_steps, 2.58), From efb23e577728920f5024c4d00bc5dc03eb35be08 Mon Sep 17 00:00:00 2001 From: Helw150 Date: Fri, 15 May 2026 12:54:58 -0700 Subject: [PATCH 3/3] ruff: reorder imports in climblab_ja tokenize module --- experiments/pretraining_datasets/climblab_ja.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/experiments/pretraining_datasets/climblab_ja.py b/experiments/pretraining_datasets/climblab_ja.py index 76840ff731..9872a028d0 100644 --- a/experiments/pretraining_datasets/climblab_ja.py +++ b/experiments/pretraining_datasets/climblab_ja.py @@ -10,12 +10,13 @@ import os.path -from experiments.marin_models import marin_tokenizer from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned from marin.processing.tokenize import TokenizeConfig, tokenize from marin.processing.tokenize.data_configs import TokenizerStep +from experiments.marin_models import marin_tokenizer + _download_spec, _normalize_spec = climblab_ja_normalize_steps() download: ExecutorStep = _download_spec.as_executor_step()