22# SPDX-License-Identifier: Apache-2.0
33
44"""
5- Nemotron v2 pre-training dataset definitions and tokenization.
5+ Nemotron v2 pre-training dataset tokenization.
66
7- These datasets come from the nvidia/Nemotron-Pre-Training-Datasets collection
8- on HuggingFace. They are additive to the original Nemotron-CC (v1) dataset
9- defined in nemotron.py.
10-
11- Most of these datasets are gated and require HF_TOKEN at download time.
12- All use parquet format with a "text" field.
7+ Download definitions live in marin.datakit.download.nemotron_v2.
8+ This file wires them into tokenization steps for experiment pipelines.
139"""
1410
1511import os .path
1612
17- from marin .datakit .download .huggingface import DownloadConfig , download_hf
13+ from marin .datakit .download .nemotron_v2 import NEMOTRON_V2_DATASETS , nemotron_v2_download_step
1814from marin .execution .executor import ExecutorStep , this_output_path , versioned
1915from marin .processing .tokenize import TokenizeConfig , tokenize
2016from marin .processing .tokenize .data_configs import TokenizerStep
2117
22- # ============================================================================
23- # DATASET DEFINITIONS
24- # ============================================================================
25-
26- # Each entry: (hf_id, revision, subsets_dict)
27- # subsets_dict maps subset_name -> glob pattern for parquet files within the download
28-
29- NEMOTRON_V2_DATASETS = {
30- "nemotron_cc_v2" : {
31- "hf_dataset_id" : "nvidia/Nemotron-CC-v2" ,
32- "revision" : "229a2e7" ,
33- "subsets" : {
34- "diverse_qa" : "Diverse-QA/**/*.parquet" ,
35- "high_quality" : "High-Quality/**/*.parquet" ,
36- "high_quality_synthetic" : "High-Quality-Synthetic/**/*.parquet" ,
37- "medium_high_quality" : "Medium-High-Quality/**/*.parquet" ,
38- "medium_quality" : "Medium-Quality/**/*.parquet" ,
39- "translated_diverse_qa" : "Translated-Diverse-QA/**/*.parquet" ,
40- },
41- },
42- "nemotron_cc_v2_1" : {
43- "hf_dataset_id" : "nvidia/Nemotron-CC-v2.1" ,
44- "revision" : "ba6f2aa" ,
45- "subsets" : {
46- "high_quality" : "High-Quality/**/*.parquet" ,
47- "high_quality_dqa" : "High-Quality-DQA/**/*.parquet" ,
48- "high_quality_synthetic" : "High-Quality-Synthetic/**/*.parquet" ,
49- "high_quality_translated" : "High-Quality-Translated-To-English/**/*.parquet" ,
50- "high_quality_translated_synthetic" : "High-Quality-Translated-To-English-Synthetic/**/*.parquet" ,
51- "medium_high_quality" : "Medium-High-Quality/**/*.parquet" ,
52- "medium_high_quality_synthetic" : "Medium-High-Quality-Synthetic/**/*.parquet" ,
53- "medium_high_quality_translated" : "Medium-High-Quality-Translated-To-English/**/*.parquet" ,
54- "medium_quality" : "Medium-Quality/**/*.parquet" ,
55- },
56- },
57- "nemotron_cc_code_v1" : {
58- "hf_dataset_id" : "nvidia/Nemotron-CC-Code-v1" ,
59- "revision" : "5c5bebc" ,
60- "subsets" : {
61- "all" : "data/**/*.parquet" ,
62- },
63- },
64- "nemotron_cc_math_v1" : {
65- "hf_dataset_id" : "nvidia/Nemotron-CC-Math-v1" ,
66- "revision" : "397a250" ,
67- "subsets" : {
68- "3" : "3/**/*.parquet" ,
69- "4plus" : "4plus/**/*.parquet" ,
70- "4plus_mind" : "4plus_MIND/**/*.parquet" ,
71- },
72- },
73- "nemotron_pretraining_code_v1" : {
74- "hf_dataset_id" : "nvidia/Nemotron-Pretraining-Code-v1" ,
75- "revision" : "01393d3" ,
76- "subsets" : {
77- "synthetic_code" : "Synthetic-Code/**/*.parquet" ,
78- "code_metadata" : "Nemotron-Code-Metadata/**/*.parquet" ,
79- },
80- },
81- "nemotron_pretraining_code_v2" : {
82- "hf_dataset_id" : "nvidia/Nemotron-Pretraining-Code-v2" ,
83- "revision" : "7b1a453" ,
84- "subsets" : {
85- "code_metadata" : "Nemotron-Code-Metadata/**/*.parquet" ,
86- "synthetic_question_answering" : "Synthetic-Question-Answering/**/*.parquet" ,
87- "synthetic_student_teacher" : "Synthetic-Student-Teacher/**/*.parquet" ,
88- "synthetic_code_review" : "Synthetic-Code-Review/**/*.parquet" ,
89- "synthetic_rewriting" : "Synthetic-Rewriting/**/*.parquet" ,
90- "synthetic_transpilation" : "Synthetic-Transpilation/**/*.parquet" ,
91- },
92- },
93- "nemotron_pretraining_specialized_v1" : {
94- "hf_dataset_id" : "nvidia/Nemotron-Pretraining-Specialized-v1" ,
95- "revision" : "9ed3718" ,
96- "subsets" : {
97- "wiki_rewrite" : "Nemotron-Pretraining-Wiki-Rewrite/**/*.parquet" ,
98- "math_textbooks" : "Nemotron-Pretraining-Math-Textbooks/**/*.parquet" ,
99- "stem_sft" : "Nemotron-Pretraining-STEM-SFT/**/*.parquet" ,
100- "scientific_coding" : "Nemotron-Pretraining-Scientific-Coding/**/*.parquet" ,
101- "rqa" : "Nemotron-Pretraining-RQA/**/*.parquet" ,
102- "infinibyte_reasoning" : "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet" ,
103- },
104- },
105- "nemotron_pretraining_sft_v1" : {
106- "hf_dataset_id" : "nvidia/Nemotron-Pretraining-SFT-v1" ,
107- "revision" : "3f1a5b8" ,
108- "subsets" : {
109- "sft_code" : "Nemotron-SFT-Code/**/*.parquet" ,
110- "sft_general" : "Nemotron-SFT-General/**/*.parquet" ,
111- "sft_math" : "Nemotron-SFT-MATH/**/*.parquet" ,
112- },
113- },
114- }
115-
116-
11718# ============================================================================
11819# RAW DATASET DOWNLOADS
11920# ============================================================================
12021
121- downloads : dict [str , ExecutorStep ] = {}
122- for _family , _info in NEMOTRON_V2_DATASETS .items ():
123- downloads [_family ] = ExecutorStep (
124- name = f"raw/{_family}" ,
125- fn = download_hf ,
126- config = DownloadConfig (
127- hf_dataset_id = _info ["hf_dataset_id" ],
128- revision = versioned (_info ["revision" ]),
129- gcs_output_path = this_output_path (),
130- wait_for_completion = True ,
131- ),
132- )
22+ downloads : dict [str , ExecutorStep ] = {
23+ family : nemotron_v2_download_step (family ).as_executor_step () for family in NEMOTRON_V2_DATASETS
24+ }
13325
13426
13527# ============================================================================
@@ -152,7 +44,7 @@ def tokenize_nemotron_v2_family(
15244 download_step = downloads [family ]
15345
15446 steps : dict [str , ExecutorStep [TokenizeConfig ]] = {}
155- for subset , glob_pattern in info ["subsets" ].items ():
47+ for subset , glob_pattern in info . subsets .items ():
15648 output_name = os .path .join ("tokenized" , family , subset )
15749 step = ExecutorStep (
15850 name = output_name ,
0 commit comments