
Commit e0d23b3

claude[bot], github-actions[bot], and Helw150 authored
Add KantaHayashiAI/ClimbLab-Ja as a Datakit source (#5741)
Registers climblab-ja (~300B Japanese tokens, 201M rows / 480 GB parquet) as a single-source datakit entry. Derived from LLM-jp Corpus v4 with Nemotron-ClimbLab-style semantic clustering and per-document quality / value scores; those columns pass through normalize so downstream consumers can re-filter without re-deriving them. License: ODC-BY.

Fixes #5740

Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com>
Co-authored-by: Helw150 <wbh230@nyu.edu>
Co-authored-by: William Held <will.held@openathena.ai>
1 parent 0c0a3ae commit e0d23b3

3 files changed: 76 additions & 0 deletions
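Because the quality / value columns survive normalize, downstream consumers can re-filter the normalized parquet shards without re-deriving scores. A minimal re-filtering sketch, assuming pandas, a numeric quality column, and the output layout the tokenize step below globs; the 0.5 threshold and the local path prefix are illustrative assumptions, not part of this commit:

import glob

import pandas as pd

# Hypothetical local prefix; the tokenize step globs "outputs/main/*.parquet"
# under the normalized/climblab-ja output, wherever the executor places it.
shards = glob.glob("normalized/climblab-ja/outputs/main/*.parquet")

kept = []
for shard in shards:
    df = pd.read_parquet(shard)
    # "quality" passes through normalize unchanged; the threshold is illustrative.
    kept.append(df[df["quality"] >= 0.5])

filtered = pd.concat(kept, ignore_index=True)
print(f"kept {len(filtered):,} rows after quality filtering")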

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""ClimbLab-Ja pre-training dataset tokenization.

Download/normalize definitions live in marin.datakit.download.climblab_ja.
This file wires the normalized output into a tokenize step for experiment
pipelines.
"""

import os.path

from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
from marin.processing.tokenize import TokenizeConfig, tokenize
from marin.processing.tokenize.data_configs import TokenizerStep

from experiments.marin_models import marin_tokenizer

_download_spec, _normalize_spec = climblab_ja_normalize_steps()

download: ExecutorStep = _download_spec.as_executor_step()
normalized: ExecutorStep = _normalize_spec.as_executor_step()


def tokenize_climblab_ja(*, tokenizer: str = marin_tokenizer) -> TokenizerStep:
    return ExecutorStep(
        name=os.path.join("tokenized", "climblab-ja"),
        fn=tokenize,
        config=TokenizeConfig(
            train_paths=[normalized / "outputs/main/*.parquet"],
            validation_paths=versioned([]),
            cache_path=this_output_path(),
            tokenizer=versioned(tokenizer),
        ),
    )


if __name__ == "__main__":
    executor_main(steps=[tokenize_climblab_ja()])
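Since the tokenizer argument is wrapped in versioned(), it presumably participates in the step's version, so re-pointing the step at another tokenizer should yield a separate cache rather than overwrite the default. A hypothetical usage sketch within this module; the alternate tokenizer id is a placeholder, not something this commit uses:

# Placeholder HF tokenizer id; any tokenizer name could be substituted here.
alt_step = tokenize_climblab_ja(tokenizer="some-org/alternate-tokenizer")
executor_main(steps=[alt_step])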
lib/marin/src/marin/datakit/download/climblab_ja.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""KantaHayashiAI/ClimbLab-Ja dataset download + normalize helpers.

~300B Japanese tokens (480 GB parquet, 201M rows) derived from LLM-jp
Corpus v4 with semantic re-clustering, a Japanese adaptation of
NVIDIA's Nemotron-ClimbLab quality-filtering pipeline. Per-document
fields ``quality``, ``advertisement``, and the four ``*_value`` scores
are preserved through normalize so downstream consumers can re-filter
without re-deriving them. License: ODC-BY.
"""

from marin.datakit.download.huggingface import download_hf_step
from marin.datakit.normalize import normalize_step
from marin.execution.step_spec import StepSpec

HF_DATASET_ID = "KantaHayashiAI/ClimbLab-Ja"
HF_REVISION = "889e349"


def climblab_ja_normalize_steps() -> tuple[StepSpec, ...]:
    """Return the ``(download, normalize)`` chain for ClimbLab-Ja."""
    download = download_hf_step(
        "raw/climblab-ja",
        hf_dataset_id=HF_DATASET_ID,
        revision=HF_REVISION,
    )
    normalize = normalize_step(
        name="normalized/climblab-ja",
        download=download,
        file_extensions=(".parquet",),
    )
    return (download, normalize)
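Pinning HF_REVISION to a fixed commit keeps the raw download reproducible if the upstream dataset changes. For orientation only, a manual fetch of the same snapshot with huggingface_hub would look roughly like this; download_hf_step's internals are not shown in this diff, so this is an assumption about equivalent behavior, not a description of it:

from huggingface_hub import snapshot_download

# Fetch the dataset files at the pinned revision; local_dir is a hypothetical
# destination standing in for the step's "raw/climblab-ja" output location.
snapshot_download(
    repo_id="KantaHayashiAI/ClimbLab-Ja",
    repo_type="dataset",
    revision="889e349",
    local_dir="raw/climblab-ja",
)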

lib/marin/src/marin/datakit/sources.py

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@

 from marin.datakit.canonical.safety_pretraining import safety_pretraining_normalize_steps
 from marin.datakit.download.biodiversity import biodiversity_normalize_steps
+from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
 from marin.datakit.download.coderforge import coderforge_normalize_steps
 from marin.datakit.download.common_pile import common_pile_normalize_steps
 from marin.datakit.download.davinci_dev import (
@@ -145,6 +146,7 @@ def all_sources() -> dict[str, DatakitSource]:
         # cp/biodiversity is carved out of common_pile (see common_pile.py)
         # because it needs page-stitching before normalize.
         ("cp/biodiversity", biodiversity_normalize_steps, 8.60),
+        ("climblab-ja", climblab_ja_normalize_steps, 371.92),
         ("coderforge", coderforge_normalize_steps, 10.29),
         ("davinci-dev/ctx-native", davinci_dev_ctx_native_normalize_steps, 57.57),
         ("davinci-dev/env-native", davinci_dev_env_native_normalize_steps, 2.58),
