|
11 | 11 | from mlcroissant._src import datasets |
12 | 12 | from mlcroissant._src.beam import ReadFromCroissant |
13 | 13 | from mlcroissant._src.core.issues import ValidationError |
14 | | -from mlcroissant._src.core.optional import deps |
15 | 14 | from mlcroissant._src.tests.records import record_to_python |
16 | 15 | from mlcroissant._src.tests.versions import parametrize_version |
17 | 16 |
|
@@ -247,74 +246,6 @@ def test_hermetic_loading_1_1(dataset_name, record_set_name, num_records, filter |
247 | 246 | ) |
248 | 247 |
|
249 | 248 |
|
250 | | -# Non-hermetic test cases (data from the internet). |
251 | | -@pytest.mark.nonhermetic |
252 | | -@parametrize_version() |
253 | | -@pytest.mark.parametrize( |
254 | | - ["dataset_name", "record_set_name", "num_records"], |
255 | | - [ |
256 | | - [ |
257 | | - "flores-200/metadata.json", |
258 | | - "language_translations_train_data_with_metadata", |
259 | | - 10, |
260 | | - ], |
261 | | - [ |
262 | | - "flores-200/metadata.json", |
263 | | - "language_translations_test_data_with_metadata", |
264 | | - 10, |
265 | | - ], |
266 | | - ["gpt-3/metadata.json", "default", 10], |
267 | | - ["huggingface-mnist/metadata.json", "default", 10], |
268 | | - ["titanic/metadata.json", "passengers", -1], |
269 | | - ], |
270 | | -) |
271 | | -def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records): |
272 | | - load_records_and_test_equality(version, dataset_name, record_set_name, num_records) |
273 | | - |
274 | | - |
275 | | -# Non-hermetic test cases for croissant 1.0 only (data from the internet). |
276 | | -@pytest.mark.nonhermetic |
277 | | -@pytest.mark.parametrize( |
278 | | - ["dataset_name", "record_set_name", "num_records", "filters"], |
279 | | - [ |
280 | | - ["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None], |
281 | | - ["huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}], |
282 | | - ["huggingface-levanti/metadata.json", "levanti_train", 10, None], |
283 | | - ["huggingface-open-hermes/metadata.json", "default", 3, None], |
284 | | - # This dataset will timeout if the following feature is broken: mlcroissant |
285 | | - # yields examples by downloading parquet files one by one. mlcroissant should |
286 | | - # not download all parquet files upfront. |
287 | | - [ |
288 | | - "https://huggingface.co/api/datasets/bigcode/the-stack-metadata/croissant", |
289 | | - "default", |
290 | | - 1, |
291 | | - {"default/split": "train"}, |
292 | | - ], |
293 | | - ], |
294 | | -) |
295 | | -def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters): |
296 | | - load_records_and_test_equality( |
297 | | - "1.0", dataset_name, record_set_name, num_records, filters |
298 | | - ) |
299 | | - |
300 | | - |
301 | | -@pytest.mark.nonhermetic |
302 | | -def test_load_from_huggingface(): |
303 | | - url = "https://huggingface.co/api/datasets/mnist/croissant" |
304 | | - dataset = datasets.Dataset(url) |
305 | | - has_one_record = False |
306 | | - for record in dataset.records(record_set="mnist"): |
307 | | - assert record["mnist/label"] == 7 |
308 | | - assert isinstance(record["mnist/image"], deps.PIL_Image.Image) |
309 | | - has_one_record = True |
310 | | - break |
311 | | - assert has_one_record, ( |
312 | | - "mlc.Dataset.records() didn't yield any record. Warning: this test is" |
313 | | - " non-hermetic and makes an API call to Hugging Face, so it's prone to network" |
314 | | - " failure." |
315 | | - ) |
316 | | - |
317 | | - |
318 | 249 | @parametrize_version() |
319 | 250 | def test_raises_when_the_record_set_does_not_exist(version): |
320 | 251 | dataset_folder = _REPOSITORY_FOLDER / "datasets" / version / "titanic" |
|
0 commit comments