Skip to content

Commit b9dea2e

Browse files
authored
Split hermetic and non-hermetic tests. (#864)
1 parent 508c46d commit b9dea2e

File tree

2 files changed

+75
-69
lines changed

2 files changed

+75
-69
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""datasets_nonhermetic_test module with data from the internet."""
2+
3+
import pytest
4+
5+
from mlcroissant._src import datasets
6+
from mlcroissant._src.core.optional import deps
7+
from mlcroissant._src.datasets_test import load_records_and_test_equality
8+
from mlcroissant._src.tests.versions import parametrize_version
9+
10+
11+
@pytest.mark.nonhermetic
@parametrize_version()
@pytest.mark.parametrize(
    "dataset_name,record_set_name,num_records",
    (
        (
            "flores-200/metadata.json",
            "language_translations_train_data_with_metadata",
            10,
        ),
        (
            "flores-200/metadata.json",
            "language_translations_test_data_with_metadata",
            10,
        ),
        ("gpt-3/metadata.json", "default", 10),
        ("huggingface-mnist/metadata.json", "default", 10),
        ("titanic/metadata.json", "passengers", -1),
    ),
)
def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records):
    """Run the shared load-and-compare helper on datasets fetched from the internet.

    The test is marked ``nonhermetic`` and is parametrized twice: once by
    ``parametrize_version()`` (which supplies ``version``) and once by the
    dataset cases listed above. All checking is delegated to
    ``load_records_and_test_equality``.

    NOTE(review): ``num_records=-1`` for titanic presumably means "all
    records" -- confirm against the helper.
    """
    case = (version, dataset_name, record_set_name, num_records)
    load_records_and_test_equality(*case)
33+
34+
35+
# Non-hermetic test cases for croissant 1.0 only (data from the internet).
@pytest.mark.nonhermetic
@pytest.mark.parametrize(
    "dataset_name,record_set_name,num_records,filters",
    (
        ("huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None),
        ("huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}),
        ("huggingface-levanti/metadata.json", "levanti_train", 10, None),
        ("huggingface-open-hermes/metadata.json", "default", 3, None),
        # This dataset will timeout if the following feature is broken: mlcroissant
        # yields examples by downloading parquet files one by one. mlcroissant should
        # not download all parquet files upfront.
        (
            "https://huggingface.co/api/datasets/bigcode/the-stack-metadata/croissant",
            "default",
            1,
            {"default/split": "train"},
        ),
    ),
)
def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters):
    """Run the shared load-and-compare helper with the version pinned to "1.0".

    Each case supplies a dataset (a ``metadata.json`` path or a Hugging Face
    Croissant URL), a record set name, a record count and optional filters.
    All of them are forwarded unchanged to ``load_records_and_test_equality``.
    """
    croissant_version = "1.0"
    load_records_and_test_equality(
        croissant_version, dataset_name, record_set_name, num_records, filters
    )
59+
60+
61+
@pytest.mark.nonhermetic
def test_load_from_huggingface():
    """Check the first record of the ``mnist`` record set served by Hugging Face.

    Builds a ``datasets.Dataset`` from the Hugging Face Croissant URL, pulls a
    single record and verifies its label and image type. The test fails with an
    explanatory message when no record is yielded at all.
    """
    dataset = datasets.Dataset("https://huggingface.co/api/datasets/mnist/croissant")
    # A unique sentinel tells "no record yielded" apart from any real record.
    nothing = object()
    first_record = next(iter(dataset.records(record_set="mnist")), nothing)
    assert first_record is not nothing, (
        "mlc.Dataset.records() didn't yield any record. Warning: this test is"
        " non-hermetic and makes an API call to Hugging Face, so it's prone to network"
        " failure."
    )
    assert first_record["mnist/label"] == 7
    assert isinstance(first_record["mnist/image"], deps.PIL_Image.Image)

python/mlcroissant/mlcroissant/_src/datasets_test.py

Lines changed: 0 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from mlcroissant._src import datasets
1212
from mlcroissant._src.beam import ReadFromCroissant
1313
from mlcroissant._src.core.issues import ValidationError
14-
from mlcroissant._src.core.optional import deps
1514
from mlcroissant._src.tests.records import record_to_python
1615
from mlcroissant._src.tests.versions import parametrize_version
1716

@@ -247,74 +246,6 @@ def test_hermetic_loading_1_1(dataset_name, record_set_name, num_records, filter
247246
)
248247

249248

250-
# Non-hermetic test cases (data from the internet).
@pytest.mark.nonhermetic
@parametrize_version()
@pytest.mark.parametrize(
    "dataset_name,record_set_name,num_records",
    (
        (
            "flores-200/metadata.json",
            "language_translations_train_data_with_metadata",
            10,
        ),
        (
            "flores-200/metadata.json",
            "language_translations_test_data_with_metadata",
            10,
        ),
        ("gpt-3/metadata.json", "default", 10),
        ("huggingface-mnist/metadata.json", "default", 10),
        ("titanic/metadata.json", "passengers", -1),
    ),
)
def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records):
    """Run the shared load-and-compare helper on datasets fetched from the internet.

    The test is marked ``nonhermetic`` and is parametrized twice: once by
    ``parametrize_version()`` (which supplies ``version``) and once by the
    dataset cases listed above. All checking is delegated to
    ``load_records_and_test_equality``.

    NOTE(review): ``num_records=-1`` for titanic presumably means "all
    records" -- confirm against the helper.
    """
    case = (version, dataset_name, record_set_name, num_records)
    load_records_and_test_equality(*case)
273-
274-
275-
# Non-hermetic test cases for croissant 1.0 only (data from the internet).
@pytest.mark.nonhermetic
@pytest.mark.parametrize(
    "dataset_name,record_set_name,num_records,filters",
    (
        ("huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None),
        ("huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}),
        ("huggingface-levanti/metadata.json", "levanti_train", 10, None),
        ("huggingface-open-hermes/metadata.json", "default", 3, None),
        # This dataset will timeout if the following feature is broken: mlcroissant
        # yields examples by downloading parquet files one by one. mlcroissant should
        # not download all parquet files upfront.
        (
            "https://huggingface.co/api/datasets/bigcode/the-stack-metadata/croissant",
            "default",
            1,
            {"default/split": "train"},
        ),
    ),
)
def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters):
    """Run the shared load-and-compare helper with the version pinned to "1.0".

    Each case supplies a dataset (a ``metadata.json`` path or a Hugging Face
    Croissant URL), a record set name, a record count and optional filters.
    All of them are forwarded unchanged to ``load_records_and_test_equality``.
    """
    croissant_version = "1.0"
    load_records_and_test_equality(
        croissant_version, dataset_name, record_set_name, num_records, filters
    )
299-
300-
301-
@pytest.mark.nonhermetic
def test_load_from_huggingface():
    """Check the first record of the ``mnist`` record set served by Hugging Face.

    Builds a ``datasets.Dataset`` from the Hugging Face Croissant URL, pulls a
    single record and verifies its label and image type. The test fails with an
    explanatory message when no record is yielded at all.
    """
    dataset = datasets.Dataset("https://huggingface.co/api/datasets/mnist/croissant")
    # A unique sentinel tells "no record yielded" apart from any real record.
    nothing = object()
    first_record = next(iter(dataset.records(record_set="mnist")), nothing)
    assert first_record is not nothing, (
        "mlc.Dataset.records() didn't yield any record. Warning: this test is"
        " non-hermetic and makes an API call to Hugging Face, so it's prone to network"
        " failure."
    )
    assert first_record["mnist/label"] == 7
    assert isinstance(first_record["mnist/image"], deps.PIL_Image.Image)
316-
317-
318249
@parametrize_version()
319250
def test_raises_when_the_record_set_does_not_exist(version):
320251
dataset_folder = _REPOSITORY_FOLDER / "datasets" / version / "titanic"

0 commit comments

Comments
 (0)