Skip to content

Commit a90123e

Browse files
ravwojdyla and claude committed
Extract nemotron_v2 download definitions into datakit/download/nemotron_v2.py
Moves NEMOTRON_V2_DATASETS and nemotron_v2_download_step() from experiments/pretraining_datasets/nemotron_v2.py into a datakit module. Replaces the raw dict with a NemotronV2Dataset dataclass. The experiment file now imports definitions and only wires tokenization. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8304b15 commit a90123e

3 files changed

Lines changed: 131 additions & 117 deletions

File tree

experiments/pretraining_datasets/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@
130130
# Nemotron v2 datasets (from nvidia/Nemotron-Pre-Training-Datasets collection)
131131
**{
132132
family: {
133-
"subsets": list(info["subsets"].keys()),
133+
"subsets": list(info.subsets.keys()),
134134
"download": nemotron_v2_downloads[family],
135135
"tokenize_fn": lambda f=family: tokenize_nemotron_v2_family(f),
136136
}

experiments/pretraining_datasets/nemotron_v2.py

Lines changed: 8 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -2,134 +2,26 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
"""
5-
Nemotron v2 pre-training dataset definitions and tokenization.
5+
Nemotron v2 pre-training dataset tokenization.
66
7-
These datasets come from the nvidia/Nemotron-Pre-Training-Datasets collection
8-
on HuggingFace. They are additive to the original Nemotron-CC (v1) dataset
9-
defined in nemotron.py.
10-
11-
Most of these datasets are gated and require HF_TOKEN at download time.
12-
All use parquet format with a "text" field.
7+
Download definitions live in marin.datakit.download.nemotron_v2.
8+
This file wires them into tokenization steps for experiment pipelines.
139
"""
1410

1511
import os.path
1612

17-
from marin.datakit.download.huggingface import DownloadConfig, download_hf
13+
from marin.datakit.download.nemotron_v2 import NEMOTRON_V2_DATASETS, nemotron_v2_download_step
1814
from marin.execution.executor import ExecutorStep, this_output_path, versioned
1915
from marin.processing.tokenize import TokenizeConfig, tokenize
2016
from marin.processing.tokenize.data_configs import TokenizerStep
2117

22-
# ============================================================================
23-
# DATASET DEFINITIONS
24-
# ============================================================================
25-
26-
# Each entry: (hf_id, revision, subsets_dict)
27-
# subsets_dict maps subset_name -> glob pattern for parquet files within the download
28-
29-
NEMOTRON_V2_DATASETS = {
30-
"nemotron_cc_v2": {
31-
"hf_dataset_id": "nvidia/Nemotron-CC-v2",
32-
"revision": "229a2e7",
33-
"subsets": {
34-
"diverse_qa": "Diverse-QA/**/*.parquet",
35-
"high_quality": "High-Quality/**/*.parquet",
36-
"high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet",
37-
"medium_high_quality": "Medium-High-Quality/**/*.parquet",
38-
"medium_quality": "Medium-Quality/**/*.parquet",
39-
"translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet",
40-
},
41-
},
42-
"nemotron_cc_v2_1": {
43-
"hf_dataset_id": "nvidia/Nemotron-CC-v2.1",
44-
"revision": "ba6f2aa",
45-
"subsets": {
46-
"high_quality": "High-Quality/**/*.parquet",
47-
"high_quality_dqa": "High-Quality-DQA/**/*.parquet",
48-
"high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet",
49-
"high_quality_translated": "High-Quality-Translated-To-English/**/*.parquet",
50-
"high_quality_translated_synthetic": "High-Quality-Translated-To-English-Synthetic/**/*.parquet",
51-
"medium_high_quality": "Medium-High-Quality/**/*.parquet",
52-
"medium_high_quality_synthetic": "Medium-High-Quality-Synthetic/**/*.parquet",
53-
"medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet",
54-
"medium_quality": "Medium-Quality/**/*.parquet",
55-
},
56-
},
57-
"nemotron_cc_code_v1": {
58-
"hf_dataset_id": "nvidia/Nemotron-CC-Code-v1",
59-
"revision": "5c5bebc",
60-
"subsets": {
61-
"all": "data/**/*.parquet",
62-
},
63-
},
64-
"nemotron_cc_math_v1": {
65-
"hf_dataset_id": "nvidia/Nemotron-CC-Math-v1",
66-
"revision": "397a250",
67-
"subsets": {
68-
"3": "3/**/*.parquet",
69-
"4plus": "4plus/**/*.parquet",
70-
"4plus_mind": "4plus_MIND/**/*.parquet",
71-
},
72-
},
73-
"nemotron_pretraining_code_v1": {
74-
"hf_dataset_id": "nvidia/Nemotron-Pretraining-Code-v1",
75-
"revision": "01393d3",
76-
"subsets": {
77-
"synthetic_code": "Synthetic-Code/**/*.parquet",
78-
"code_metadata": "Nemotron-Code-Metadata/**/*.parquet",
79-
},
80-
},
81-
"nemotron_pretraining_code_v2": {
82-
"hf_dataset_id": "nvidia/Nemotron-Pretraining-Code-v2",
83-
"revision": "7b1a453",
84-
"subsets": {
85-
"code_metadata": "Nemotron-Code-Metadata/**/*.parquet",
86-
"synthetic_question_answering": "Synthetic-Question-Answering/**/*.parquet",
87-
"synthetic_student_teacher": "Synthetic-Student-Teacher/**/*.parquet",
88-
"synthetic_code_review": "Synthetic-Code-Review/**/*.parquet",
89-
"synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet",
90-
"synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet",
91-
},
92-
},
93-
"nemotron_pretraining_specialized_v1": {
94-
"hf_dataset_id": "nvidia/Nemotron-Pretraining-Specialized-v1",
95-
"revision": "9ed3718",
96-
"subsets": {
97-
"wiki_rewrite": "Nemotron-Pretraining-Wiki-Rewrite/**/*.parquet",
98-
"math_textbooks": "Nemotron-Pretraining-Math-Textbooks/**/*.parquet",
99-
"stem_sft": "Nemotron-Pretraining-STEM-SFT/**/*.parquet",
100-
"scientific_coding": "Nemotron-Pretraining-Scientific-Coding/**/*.parquet",
101-
"rqa": "Nemotron-Pretraining-RQA/**/*.parquet",
102-
"infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet",
103-
},
104-
},
105-
"nemotron_pretraining_sft_v1": {
106-
"hf_dataset_id": "nvidia/Nemotron-Pretraining-SFT-v1",
107-
"revision": "3f1a5b8",
108-
"subsets": {
109-
"sft_code": "Nemotron-SFT-Code/**/*.parquet",
110-
"sft_general": "Nemotron-SFT-General/**/*.parquet",
111-
"sft_math": "Nemotron-SFT-MATH/**/*.parquet",
112-
},
113-
},
114-
}
115-
116-
11718
# ============================================================================
11819
# RAW DATASET DOWNLOADS
11920
# ============================================================================
12021

121-
downloads: dict[str, ExecutorStep] = {}
122-
for _family, _info in NEMOTRON_V2_DATASETS.items():
123-
downloads[_family] = ExecutorStep(
124-
name=f"raw/{_family}",
125-
fn=download_hf,
126-
config=DownloadConfig(
127-
hf_dataset_id=_info["hf_dataset_id"],
128-
revision=versioned(_info["revision"]),
129-
gcs_output_path=this_output_path(),
130-
wait_for_completion=True,
131-
),
132-
)
22+
downloads: dict[str, ExecutorStep] = {
23+
family: nemotron_v2_download_step(family).as_executor_step() for family in NEMOTRON_V2_DATASETS
24+
}
13325

13426

13527
# ============================================================================
@@ -152,7 +44,7 @@ def tokenize_nemotron_v2_family(
15244
download_step = downloads[family]
15345

15446
steps: dict[str, ExecutorStep[TokenizeConfig]] = {}
155-
for subset, glob_pattern in info["subsets"].items():
47+
for subset, glob_pattern in info.subsets.items():
15648
output_name = os.path.join("tokenized", family, subset)
15749
step = ExecutorStep(
15850
name=output_name,
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# Copyright The Marin Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Nemotron v2 pre-training dataset download definitions.
5+
6+
These datasets come from the nvidia/Nemotron-Pre-Training-Datasets collection
7+
on HuggingFace. They are additive to the original Nemotron-CC (v1) dataset.
8+
9+
Most of these datasets are gated and require HF_TOKEN at download time.
10+
All use parquet format with a "text" field.
11+
"""
12+
13+
from dataclasses import dataclass, field
14+
15+
from marin.datakit.download.huggingface import download_hf_step
16+
from marin.execution.step_spec import StepSpec
17+
18+
19+
@dataclass(frozen=True)
class NemotronV2Dataset:
    """Metadata for a single Nemotron v2 HuggingFace dataset.

    Attributes:
        hf_dataset_id: Fully qualified HuggingFace dataset id,
            e.g. "nvidia/Nemotron-CC-v2".
        revision: Git revision (short commit hash) the download is pinned to.
        subsets: Maps subset_name -> glob pattern for that subset's parquet
            files within the downloaded dataset tree.
    """

    hf_dataset_id: str
    revision: str
    # NOTE: attribute description lives in the class docstring above — a bare
    # string literal after a field is a no-op statement, not a docstring.
    subsets: dict[str, str] = field(default_factory=dict)
27+
28+
29+
# Registry of Nemotron v2 dataset families, keyed by the short family name
# used throughout the experiment pipelines. Insertion order is meaningful:
# downstream dicts built by iterating this registry preserve it.
NEMOTRON_V2_DATASETS: dict[str, NemotronV2Dataset] = {
    # --- Common Crawl derived corpora (quality buckets + synthetic/translated variants) ---
    "nemotron_cc_v2": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-CC-v2",
        revision="229a2e7",
        subsets={
            "diverse_qa": "Diverse-QA/**/*.parquet",
            "high_quality": "High-Quality/**/*.parquet",
            "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet",
            "medium_high_quality": "Medium-High-Quality/**/*.parquet",
            "medium_quality": "Medium-Quality/**/*.parquet",
            "translated_diverse_qa": "Translated-Diverse-QA/**/*.parquet",
        },
    ),
    "nemotron_cc_v2_1": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-CC-v2.1",
        revision="ba6f2aa",
        subsets={
            "high_quality": "High-Quality/**/*.parquet",
            "high_quality_dqa": "High-Quality-DQA/**/*.parquet",
            "high_quality_synthetic": "High-Quality-Synthetic/**/*.parquet",
            "high_quality_translated": "High-Quality-Translated-To-English/**/*.parquet",
            "high_quality_translated_synthetic": "High-Quality-Translated-To-English-Synthetic/**/*.parquet",
            "medium_high_quality": "Medium-High-Quality/**/*.parquet",
            "medium_high_quality_synthetic": "Medium-High-Quality-Synthetic/**/*.parquet",
            "medium_high_quality_translated": "Medium-High-Quality-Translated-To-English/**/*.parquet",
            "medium_quality": "Medium-Quality/**/*.parquet",
        },
    ),
    # --- Code and math corpora ---
    "nemotron_cc_code_v1": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-CC-Code-v1",
        revision="5c5bebc",
        subsets={"all": "data/**/*.parquet"},
    ),
    "nemotron_cc_math_v1": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-CC-Math-v1",
        revision="397a250",
        subsets={
            "3": "3/**/*.parquet",
            "4plus": "4plus/**/*.parquet",
            "4plus_mind": "4plus_MIND/**/*.parquet",
        },
    ),
    "nemotron_pretraining_code_v1": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1",
        revision="01393d3",
        subsets={
            "synthetic_code": "Synthetic-Code/**/*.parquet",
            "code_metadata": "Nemotron-Code-Metadata/**/*.parquet",
        },
    ),
    "nemotron_pretraining_code_v2": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v2",
        revision="7b1a453",
        subsets={
            "code_metadata": "Nemotron-Code-Metadata/**/*.parquet",
            "synthetic_question_answering": "Synthetic-Question-Answering/**/*.parquet",
            "synthetic_student_teacher": "Synthetic-Student-Teacher/**/*.parquet",
            "synthetic_code_review": "Synthetic-Code-Review/**/*.parquet",
            "synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet",
            "synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet",
        },
    ),
    # --- Specialized / SFT-style pretraining corpora ---
    "nemotron_pretraining_specialized_v1": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-Pretraining-Specialized-v1",
        revision="9ed3718",
        subsets={
            "wiki_rewrite": "Nemotron-Pretraining-Wiki-Rewrite/**/*.parquet",
            "math_textbooks": "Nemotron-Pretraining-Math-Textbooks/**/*.parquet",
            "stem_sft": "Nemotron-Pretraining-STEM-SFT/**/*.parquet",
            "scientific_coding": "Nemotron-Pretraining-Scientific-Coding/**/*.parquet",
            "rqa": "Nemotron-Pretraining-RQA/**/*.parquet",
            "infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet",
        },
    ),
    "nemotron_pretraining_sft_v1": NemotronV2Dataset(
        hf_dataset_id="nvidia/Nemotron-Pretraining-SFT-v1",
        revision="3f1a5b8",
        subsets={
            "sft_code": "Nemotron-SFT-Code/**/*.parquet",
            "sft_general": "Nemotron-SFT-General/**/*.parquet",
            "sft_math": "Nemotron-SFT-MATH/**/*.parquet",
        },
    ),
}
113+
114+
115+
def nemotron_v2_download_step(family: str) -> StepSpec:
    """Create a download StepSpec for a Nemotron v2 dataset family.

    Args:
        family: Key into NEMOTRON_V2_DATASETS, e.g. "nemotron_cc_v2".

    Returns:
        A StepSpec that downloads the family's HuggingFace dataset, pinned to
        its recorded revision, under the step name ``raw/<family>``.

    Raises:
        KeyError: If ``family`` is not a known Nemotron v2 dataset family.
    """
    if family not in NEMOTRON_V2_DATASETS:
        raise KeyError(
            f"Unknown Nemotron v2 family {family!r}; "
            f"expected one of {sorted(NEMOTRON_V2_DATASETS)}"
        )
    info = NEMOTRON_V2_DATASETS[family]
    # NOTE(review): the previous ExecutorStep wrapped the revision in
    # versioned(...) — confirm download_hf_step versions the revision too.
    return download_hf_step(
        f"raw/{family}",
        hf_dataset_id=info.hf_dataset_id,
        revision=info.revision,
    )

0 commit comments

Comments
 (0)