Skip to content

Commit 566d4d9

Browse files
ravwojdyla and claude committed
Remove unused config dataclasses from download functions
Removes NemotronIngressConfig, DCLMHQDownloadConfig, and TransferConfig. The underlying functions (download_nemotron_cc, extract_dclm_hq_dump, transfer_files) now take plain parameters directly. Updates the tests and the nemotron.py experiment to use the flat-param API or the *_step() functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 710fe9b commit 566d4d9

9 files changed

Lines changed: 12 additions & 71 deletions

File tree

experiments/pretraining_datasets/nemotron.py

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8,20 +8,14 @@
88

99
from experiments.defaults import DEFAULT_NEW_RUN_DATA_SHUFFLE
1010
from experiments.pretraining_datasets.dclm import dclm_components_llama3
11-
from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc
11+
from marin.datakit.download.nemotron_cc import nemotron_cc_step
1212
from marin.execution.executor import ExecutorStep, output_path_of, this_output_path, versioned
1313
from marin.processing.tokenize import TokenizeConfig, lm_mixture_data_config, tokenize
1414
from marin.processing.tokenize.data_configs import TokenizerStep
1515

1616
# Raw dataset download step
1717
downloads = {
18-
"nemotron_cc": ExecutorStep(
19-
name="raw/nemotro-cc",
20-
fn=download_nemotron_cc,
21-
config=NemotronIngressConfig(
22-
output_path=this_output_path(),
23-
),
24-
)
18+
"nemotron_cc": nemotron_cc_step("raw/nemotro-cc").as_executor_step(),
2519
}
2620

2721
_nemotron_cc_path = output_path_of(downloads["nemotron_cc"], "contrib/Nemotron/Nemotron-CC/data-jsonl/")

lib/marin/src/marin/datakit/download/dclm_hq.py

Lines changed: 2 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -34,12 +34,6 @@
3434
logger = logging.getLogger(__name__)
3535

3636

37-
@dataclass
38-
class DCLMHQDownloadConfig:
39-
input_path: str
40-
output_path: str
41-
42-
4337
@dataclass
4438
class FileTask:
4539
"""Represents a single file processing task."""
@@ -172,21 +166,8 @@ def process_file(task: FileTask) -> None:
172166
raise
173167

174168

175-
def extract_dclm_hq_dump(input_path_or_cfg: str | DCLMHQDownloadConfig, output_path: str | None = None) -> None:
176-
"""Process the DCLM HQ dump and enrich with HTML from Common Crawl.
177-
178-
Args:
179-
input_path_or_cfg: Input directory path, or a DCLMHQDownloadConfig for backward compat.
180-
output_path: Output directory path. Required when input_path_or_cfg is a string.
181-
"""
182-
if isinstance(input_path_or_cfg, DCLMHQDownloadConfig):
183-
input_path = input_path_or_cfg.input_path
184-
output_path = input_path_or_cfg.output_path
185-
else:
186-
input_path = input_path_or_cfg
187-
if output_path is None:
188-
raise ValueError("output_path is required when input_path_or_cfg is a string")
189-
169+
def extract_dclm_hq_dump(input_path: str, output_path: str) -> None:
170+
"""Process the DCLM HQ dump and enrich with HTML from Common Crawl."""
190171
logger.info(f"Starting processing of DCLM HQ dump in {input_path}")
191172

192173
all_files = []

lib/marin/src/marin/datakit/download/filesystem.py

Lines changed: 0 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
import os
66
import random
77
import time
8-
from dataclasses import dataclass
98

109
from iris.marin_fs import url_to_fs
1110
from marin.execution.step_spec import StepSpec
@@ -16,16 +15,6 @@
1615
logger = logging.getLogger(__name__)
1716

1817

19-
@dataclass
20-
class TransferConfig:
21-
"""Kept for backward compatibility. Prefer ``transfer_files()`` with flat params."""
22-
23-
input_path: str
24-
output_path: str
25-
num_random_files: int | None = None
26-
filetype: str = "jsonl.zst"
27-
28-
2918
def transfer_files(
3019
input_path: str,
3120
output_path: str,

lib/marin/src/marin/datakit/download/nemotron_cc.py

Lines changed: 2 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,10 @@
1414
import logging
1515
import os
1616
from collections.abc import Iterator
17-
from dataclasses import dataclass
1817

1918
import requests
2019
import zstandard
2120
from iris.marin_fs import open_url
22-
from marin.execution import THIS_OUTPUT_PATH
2321
from marin.execution.step_spec import StepSpec
2422
from marin.utils import fsspec_exists
2523
from requests.adapters import HTTPAdapter
@@ -85,22 +83,8 @@ def download_single_nemotron_path(input_file_path: str, output_file_path: str) -
8583
return {"input_file": input_file_path, "output_file": output_file_path, "num_records": num_records}
8684

8785

88-
@dataclass
89-
class NemotronIngressConfig:
90-
"""Kept for backward compatibility with ExecutorStep callers."""
91-
92-
output_path: str = THIS_OUTPUT_PATH
93-
94-
95-
def download_nemotron_cc(output_path_or_cfg: str | NemotronIngressConfig) -> None:
96-
"""Download and process Nemotron-CC dataset from Common Crawl.
97-
98-
Args:
99-
output_path_or_cfg: Output directory path, or a NemotronIngressConfig for backward compat.
100-
"""
101-
output_path = (
102-
output_path_or_cfg.output_path if isinstance(output_path_or_cfg, NemotronIngressConfig) else output_path_or_cfg
103-
)
86+
def download_nemotron_cc(output_path: str) -> None:
87+
"""Download and process Nemotron-CC dataset from Common Crawl."""
10488

10589
paths_file_path = os.path.join(output_path, "data-jsonl.paths")
10690
logger.info(f"Downloading Nemotron CC path file {paths_file_path}")

lib/marin/src/marin/download/dclm_hq/download_dclm_hq_html.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
# SPDX-License-Identifier: Apache-2.0
33
# Backward-compat shim. Canonical location: marin.datakit.download.dclm_hq
44

5-
from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig as DCLMHQDownloadConfig
65
from marin.datakit.download.dclm_hq import FileTask as FileTask
76
from marin.datakit.download.dclm_hq import extract_dclm_hq_dump as extract_dclm_hq_dump
87
from marin.datakit.download.dclm_hq import fetch_warc_from_cc as fetch_warc_from_cc

lib/marin/src/marin/download/filesystem/transfer.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,5 +2,4 @@
22
# SPDX-License-Identifier: Apache-2.0
33
# Backward-compat shim. Canonical location: marin.datakit.download.filesystem
44

5-
from marin.datakit.download.filesystem import TransferConfig as TransferConfig
65
from marin.datakit.download.filesystem import transfer_files as transfer_files

lib/marin/src/marin/download/nemotron_cc/download_nemotron_cc.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,5 @@
22
# SPDX-License-Identifier: Apache-2.0
33
# Backward-compat shim. Canonical location: marin.datakit.download.nemotron_cc
44

5-
from marin.datakit.download.nemotron_cc import NemotronIngressConfig as NemotronIngressConfig
65
from marin.datakit.download.nemotron_cc import download_nemotron_cc as download_nemotron_cc
76
from marin.datakit.download.nemotron_cc import download_single_nemotron_path as download_single_nemotron_path

tests/download/test_dclm_hq.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
from unittest.mock import patch
88

99
import zstandard as zstd
10-
from marin.datakit.download.dclm_hq import DCLMHQDownloadConfig, extract_dclm_hq_dump
10+
from marin.datakit.download.dclm_hq import extract_dclm_hq_dump
1111

1212
SAMPLE_DCLM_RECORDS = [
1313
{
@@ -171,8 +171,7 @@ def mock_requests_get(url, **kwargs):
171171
raise ValueError(f"Unexpected URL: {url}")
172172

173173
with patch("marin.datakit.download.dclm_hq.requests.get", side_effect=mock_requests_get):
174-
cfg = DCLMHQDownloadConfig(input_path=str(tmp_path / "input"), output_path=str(output_dir))
175-
extract_dclm_hq_dump(cfg)
174+
extract_dclm_hq_dump(str(tmp_path / "input"), str(output_dir))
176175

177176
# Verify output files were created in nested structure
178177
shard1_output = output_dir / "shard1"

tests/download/test_nemotron_cc.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import pytest
1010
import zstandard as zstd
1111
from iris.marin_fs import open_url as _real_open_url
12-
from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc
12+
from marin.datakit.download.nemotron_cc import download_nemotron_cc
1313

1414
_OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url"
1515
_REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session"
@@ -114,8 +114,7 @@ def test_download_nemotron_cc_pipeline(tmp_path, mock_paths_open):
114114
patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)),
115115
patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"file1": file1_data, "file2": file2_data})),
116116
):
117-
cfg = NemotronIngressConfig(output_path=str(output_dir))
118-
download_nemotron_cc(cfg)
117+
download_nemotron_cc(str(output_dir))
119118

120119
all_records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron")
121120

@@ -152,8 +151,7 @@ def test_download_nemotron_cc_dolma_format(tmp_path, mock_paths_open):
152151
patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)),
153152
patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"test": compressed_data})),
154153
):
155-
cfg = NemotronIngressConfig(output_path=str(output_dir))
156-
download_nemotron_cc(cfg)
154+
download_nemotron_cc(str(output_dir))
157155

158156
records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron")
159157
assert len(records) == 1
@@ -188,8 +186,7 @@ def test_download_nemotron_cc_skips_existing(tmp_path, mock_paths_open):
188186
patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)),
189187
patch(_REQUESTS_SESSION_TARGET) as mock_session,
190188
):
191-
cfg = NemotronIngressConfig(output_path=str(output_dir))
192-
download_nemotron_cc(cfg)
189+
download_nemotron_cc(str(output_dir))
193190

194191
mock_session.return_value.get.assert_not_called()
195192
assert existing_output.read_text() == "existing"

0 commit comments

Comments
 (0)