
Commit e0d23b3

claude[bot], github-actions[bot], and Helw150 authored
Add KantaHayashiAI/ClimbLab-Ja as a Datakit source (#5741)
Registers climblab-ja (~300B Japanese tokens, 201M rows / 480 GB parquet) as a single-source datakit entry. Derived from LLM-jp Corpus v4 with Nemotron-ClimbLab-style semantic clustering and per-document quality / value scores; those columns pass through normalize so downstream consumers can re-filter without re-deriving them. License: ODC-BY.

Fixes #5740

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Helw150 <wbh230@nyu.edu>
Co-authored-by: William Held <will.held@openathena.ai>
1 parent 0c0a3ae commit e0d23b3

3 files changed: 76 additions & 0 deletions
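Because the quality / value columns survive normalize, downstream consumers can re-filter the normalized parquet shards without re-deriving scores. A minimal re-filtering sketch, assuming pandas, a numeric quality column, and the output layout the tokenize step below globs; the 0.5 threshold and the local path prefix are illustrative assumptions, not part of this commit:

import glob

import pandas as pd

# Hypothetical local prefix; the tokenize step globs "outputs/main/*.parquet"
# under the normalized/climblab-ja output, wherever the executor places it.
shards = glob.glob("normalized/climblab-ja/outputs/main/*.parquet")

kept = []
for shard in shards:
    df = pd.read_parquet(shard)
    # "quality" passes through normalize unchanged; the threshold is illustrative.
    kept.append(df[df["quality"] >= 0.5])

filtered = pd.concat(kept, ignore_index=True)
print(f"kept {len(filtered):,} rows after quality filtering")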

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""ClimbLab-Ja pre-training dataset tokenization.

Download/normalize definitions live in marin.datakit.download.climblab_ja.
This file wires the normalized output into a tokenize step for experiment
pipelines.
"""

import os.path

from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
from marin.processing.tokenize import TokenizeConfig, tokenize
from marin.processing.tokenize.data_configs import TokenizerStep

from experiments.marin_models import marin_tokenizer

_download_spec, _normalize_spec = climblab_ja_normalize_steps()

download: ExecutorStep = _download_spec.as_executor_step()
normalized: ExecutorStep = _normalize_spec.as_executor_step()


def tokenize_climblab_ja(*, tokenizer: str = marin_tokenizer) -> TokenizerStep:
    return ExecutorStep(
        name=os.path.join("tokenized", "climblab-ja"),
        fn=tokenize,
        config=TokenizeConfig(
            train_paths=[normalized / "outputs/main/*.parquet"],
            validation_paths=versioned([]),
            cache_path=this_output_path(),
            tokenizer=versioned(tokenizer),
        ),
    )


if __name__ == "__main__":
    executor_main(steps=[tokenize_climblab_ja()])
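Since the tokenizer argument is wrapped in versioned(), it presumably participates in the step's version, so re-pointing the step at another tokenizer should yield a separate cache rather than overwrite the default. A hypothetical usage sketch within this module; the alternate tokenizer id is a placeholder, not something this commit uses:

# Placeholder HF tokenizer id; any tokenizer name could be substituted here.
alt_step = tokenize_climblab_ja(tokenizer="some-org/alternate-tokenizer")
executor_main(steps=[alt_step])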
lib/marin/src/marin/datakit/download/climblab_ja.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""KantaHayashiAI/ClimbLab-Ja dataset download + normalize helpers.

~300B Japanese tokens (480 GB parquet, 201M rows) derived from LLM-jp
Corpus v4 with semantic re-clustering, a Japanese adaptation of
NVIDIA's Nemotron-ClimbLab quality-filtering pipeline. Per-document
fields ``quality``, ``advertisement``, and the four ``*_value`` scores
are preserved through normalize so downstream consumers can re-filter
without re-deriving them. License: ODC-BY.
"""

from marin.datakit.download.huggingface import download_hf_step
from marin.datakit.normalize import normalize_step
from marin.execution.step_spec import StepSpec

HF_DATASET_ID = "KantaHayashiAI/ClimbLab-Ja"
HF_REVISION = "889e349"


def climblab_ja_normalize_steps() -> tuple[StepSpec, ...]:
    """Return the ``(download, normalize)`` chain for ClimbLab-Ja."""
    download = download_hf_step(
        "raw/climblab-ja",
        hf_dataset_id=HF_DATASET_ID,
        revision=HF_REVISION,
    )
    normalize = normalize_step(
        name="normalized/climblab-ja",
        download=download,
        file_extensions=(".parquet",),
    )
    return (download, normalize)
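Pinning HF_REVISION to a fixed commit keeps the raw download reproducible if the upstream dataset changes. For orientation only, a manual fetch of the same snapshot with huggingface_hub would look roughly like this; download_hf_step's internals are not shown in this diff, so this is an assumption about equivalent behavior, not a description of it:

from huggingface_hub import snapshot_download

# Fetch the dataset files at the pinned revision; local_dir is a hypothetical
# destination standing in for the step's "raw/climblab-ja" output location.
snapshot_download(
    repo_id="KantaHayashiAI/ClimbLab-Ja",
    repo_type="dataset",
    revision="889e349",
    local_dir="raw/climblab-ja",
)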

lib/marin/src/marin/datakit/sources.py

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@

 from marin.datakit.canonical.safety_pretraining import safety_pretraining_normalize_steps
 from marin.datakit.download.biodiversity import biodiversity_normalize_steps
+from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
 from marin.datakit.download.coderforge import coderforge_normalize_steps
 from marin.datakit.download.common_pile import common_pile_normalize_steps
 from marin.datakit.download.davinci_dev import (
@@ -145,6 +146,7 @@ def all_sources() -> dict[str, DatakitSource]:
         # cp/biodiversity is carved out of common_pile (see common_pile.py)
         # because it needs page-stitching before normalize.
         ("cp/biodiversity", biodiversity_normalize_steps, 8.60),
+        ("climblab-ja", climblab_ja_normalize_steps, 371.92),
         ("coderforge", coderforge_normalize_steps, 10.29),
         ("davinci-dev/ctx-native", davinci_dev_ctx_native_normalize_steps, 57.57),
         ("davinci-dev/env-native", davinci_dev_env_native_normalize_steps, 2.58),
