Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions experiments/pretraining_datasets/climblab_ja.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""ClimbLab-Ja pre-training dataset tokenization.

Download/normalize definitions live in marin.datakit.download.climblab_ja.
This file wires the normalized output into a tokenize step for experiment
pipelines.
"""

import os.path

from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
from marin.execution.executor import ExecutorStep, executor_main, this_output_path, versioned
from marin.processing.tokenize import TokenizeConfig, tokenize
from marin.processing.tokenize.data_configs import TokenizerStep

from experiments.marin_models import marin_tokenizer

# Unpack the two specs returned by the datakit helper for ClimbLab-Ja.
_download_spec, _normalize_spec = climblab_ja_normalize_steps()

# Convert each spec into an ExecutorStep and expose both at module level;
# ``normalized`` is the step the tokenize config below reads its inputs from.
download: ExecutorStep = _download_spec.as_executor_step()
normalized: ExecutorStep = _normalize_spec.as_executor_step()


def tokenize_climblab_ja(*, tokenizer: str = marin_tokenizer) -> TokenizerStep:
    """Build the executor step that tokenizes the normalized ClimbLab-Ja data.

    Args:
        tokenizer: Tokenizer identifier placed (wrapped in ``versioned``) into
            the ``TokenizeConfig``. Defaults to ``marin_tokenizer``.

    Returns:
        An ``ExecutorStep`` that runs ``tokenize`` with the parquet files under
        the ``normalized`` step's ``outputs/main`` as training input, an empty
        validation set, and this step's own output path as the cache location.
    """
    step_name = os.path.join("tokenized", "climblab-ja")
    # Glob over the parquet files located under the normalize step's output.
    train_glob = normalized / "outputs/main/*.parquet"
    tokenize_config = TokenizeConfig(
        train_paths=[train_glob],
        validation_paths=versioned([]),
        cache_path=this_output_path(),
        tokenizer=versioned(tokenizer),
    )
    return ExecutorStep(name=step_name, fn=tokenize, config=tokenize_config)


if __name__ == "__main__":
    # Script entry point: hand the single tokenize step to the executor.
    steps_to_run = [tokenize_climblab_ja()]
    executor_main(steps=steps_to_run)
34 changes: 34 additions & 0 deletions lib/marin/src/marin/datakit/download/climblab_ja.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright The Marin Authors
# SPDX-License-Identifier: Apache-2.0

"""KantaHayashiAI/ClimbLab-Ja dataset download + normalize helpers.

~300B Japanese tokens (480 GB parquet, 201M rows) derived from LLM-jp
Corpus v4 with semantic re-clustering — a Japanese adaptation of
NVIDIA's Nemotron-ClimbLab quality-filtering pipeline. Per-document
fields ``quality``, ``advertisement``, and the four ``*_value`` scores
are preserved through normalize so downstream consumers can re-filter
without re-deriving them. License: ODC-BY.
"""

from marin.datakit.download.huggingface import download_hf_step
from marin.datakit.normalize import normalize_step
from marin.execution.step_spec import StepSpec

# Hugging Face dataset identifier passed to ``download_hf_step``.
HF_DATASET_ID = "KantaHayashiAI/ClimbLab-Ja"
# Revision passed to ``download_hf_step`` so the download is pinned.
# NOTE(review): presumably an abbreviated commit hash of the dataset repo — confirm.
HF_REVISION = "889e349"


def climblab_ja_normalize_steps() -> tuple[StepSpec, ...]:
    """Build the ClimbLab-Ja step chain.

    Returns:
        A ``(download, normalize)`` pair of step specs. The download spec
        targets ``HF_DATASET_ID`` at ``HF_REVISION`` under the name
        ``raw/climblab-ja``; the normalize spec is named
        ``normalized/climblab-ja``, takes the download spec as its input, and
        is restricted to ``.parquet`` files.
    """
    raw_spec = download_hf_step(
        "raw/climblab-ja",
        hf_dataset_id=HF_DATASET_ID,
        revision=HF_REVISION,
    )
    normalized_spec = normalize_step(
        name="normalized/climblab-ja",
        download=raw_spec,
        file_extensions=(".parquet",),
    )
    return raw_spec, normalized_spec
2 changes: 2 additions & 0 deletions lib/marin/src/marin/datakit/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from marin.datakit.canonical.safety_pretraining import safety_pretraining_normalize_steps
from marin.datakit.download.biodiversity import biodiversity_normalize_steps
from marin.datakit.download.climblab_ja import climblab_ja_normalize_steps
from marin.datakit.download.coderforge import coderforge_normalize_steps
from marin.datakit.download.common_pile import common_pile_normalize_steps
from marin.datakit.download.davinci_dev import (
Expand Down Expand Up @@ -145,6 +146,7 @@ def all_sources() -> dict[str, DatakitSource]:
# cp/biodiversity is carved out of common_pile (see common_pile.py)
# because it needs page-stitching before normalize.
("cp/biodiversity", biodiversity_normalize_steps, 8.60),
("climblab-ja", climblab_ja_normalize_steps, 371.92),
("coderforge", coderforge_normalize_steps, 10.29),
("davinci-dev/ctx-native", davinci_dev_ctx_native_normalize_steps, 57.57),
("davinci-dev/env-native", davinci_dev_env_native_normalize_steps, 2.58),
Expand Down
Loading