|
9 | 9 | import pytest |
10 | 10 | import zstandard as zstd |
11 | 11 | from iris.marin_fs import open_url as _real_open_url |
12 | | -from marin.datakit.download.nemotron_cc import NemotronIngressConfig, download_nemotron_cc |
| 12 | +from marin.datakit.download.nemotron_cc import download_nemotron_cc |
13 | 13 |
|
14 | 14 | _OPEN_URL_TARGET = "marin.datakit.download.nemotron_cc.open_url" |
15 | 15 | _REQUESTS_SESSION_TARGET = "marin.datakit.download.nemotron_cc.requests.Session" |
@@ -114,8 +114,7 @@ def test_download_nemotron_cc_pipeline(tmp_path, mock_paths_open): |
114 | 114 | patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), |
115 | 115 | patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"file1": file1_data, "file2": file2_data})), |
116 | 116 | ): |
117 | | - cfg = NemotronIngressConfig(output_path=str(output_dir)) |
118 | | - download_nemotron_cc(cfg) |
| 117 | + download_nemotron_cc(str(output_dir)) |
119 | 118 |
|
120 | 119 | all_records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron") |
121 | 120 |
|
@@ -152,8 +151,7 @@ def test_download_nemotron_cc_dolma_format(tmp_path, mock_paths_open): |
152 | 151 | patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), |
153 | 152 | patch(_REQUESTS_SESSION_TARGET, _mock_session_for({"test": compressed_data})), |
154 | 153 | ): |
155 | | - cfg = NemotronIngressConfig(output_path=str(output_dir)) |
156 | | - download_nemotron_cc(cfg) |
| 154 | + download_nemotron_cc(str(output_dir)) |
157 | 155 |
|
158 | 156 | records = read_all_jsonl_zst(output_dir / "contrib" / "Nemotron") |
159 | 157 | assert len(records) == 1 |
@@ -188,8 +186,7 @@ def test_download_nemotron_cc_skips_existing(tmp_path, mock_paths_open): |
188 | 186 | patch(_OPEN_URL_TARGET, side_effect=mock_paths_open(paths)), |
189 | 187 | patch(_REQUESTS_SESSION_TARGET) as mock_session, |
190 | 188 | ): |
191 | | - cfg = NemotronIngressConfig(output_path=str(output_dir)) |
192 | | - download_nemotron_cc(cfg) |
| 189 | + download_nemotron_cc(str(output_dir)) |
193 | 190 |
|
194 | 191 | mock_session.return_value.get.assert_not_called() |
195 | 192 | assert existing_output.read_text() == "existing" |
0 commit comments