Skip to content

Commit 6de9362

Browse files
ravwojdyla and claude committed
Add override_output_path to nemotron_v2 datasets
Wire override_output_path through NemotronV2Dataset to download_nemotron_v2_step. Fix missing raw/ prefix on nemotron_cc_math_v1. Add overrides for code_v2, specialized_v1, and sft_v1 to pin existing output paths.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 280bc36 commit 6de9362

1 file changed

Lines changed: 5 additions & 1 deletion

File tree

lib/marin/src/marin/datakit/download/nemotron_v2.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,7 @@ class NemotronV2Dataset:
7171
"4plus": "4plus/**/*.parquet",
7272
"4plus_mind": "4plus_MIND/**/*.parquet",
7373
},
74-
override_output_path="nemotron_cc_math_v1-322fe4",
74+
override_output_path="raw/nemotron_cc_math_v1-322fe4",
7575
),
7676
"nemotron_pretraining_code_v1": NemotronV2Dataset(
7777
hf_dataset_id="nvidia/Nemotron-Pretraining-Code-v1",
@@ -93,6 +93,7 @@ class NemotronV2Dataset:
9393
"synthetic_rewriting": "Synthetic-Rewriting/**/*.parquet",
9494
"synthetic_transpilation": "Synthetic-Transpilation/**/*.parquet",
9595
},
96+
override_output_path="raw/nemotron_pretraining_code_v2-d15a24",
9697
),
9798
"nemotron_pretraining_specialized_v1": NemotronV2Dataset(
9899
hf_dataset_id="nvidia/Nemotron-Pretraining-Specialized-v1",
@@ -105,6 +106,7 @@ class NemotronV2Dataset:
105106
"rqa": "Nemotron-Pretraining-RQA/**/*.parquet",
106107
"infinibyte_reasoning": "Nemotron-Pretraining-InfiniByte-Reasoning/**/*.parquet",
107108
},
109+
override_output_path="raw/nemotron_pretraining_specialized_v1-a31fae",
108110
),
109111
"nemotron_pretraining_sft_v1": NemotronV2Dataset(
110112
hf_dataset_id="nvidia/Nemotron-Pretraining-SFT-v1",
@@ -114,6 +116,7 @@ class NemotronV2Dataset:
114116
"sft_general": "Nemotron-SFT-General/**/*.parquet",
115117
"sft_math": "Nemotron-SFT-MATH/**/*.parquet",
116118
},
119+
override_output_path="raw/nemotron_pretraining_sft_v1-10f77e",
117120
),
118121
}
119122

@@ -125,4 +128,5 @@ def download_nemotron_v2_step(family: str) -> StepSpec:
125128
f"raw/{family}",
126129
hf_dataset_id=info.hf_dataset_id,
127130
revision=info.revision,
131+
override_output_path=info.override_output_path,
128132
)

0 commit comments

Comments
 (0)