-
Notifications
You must be signed in to change notification settings - Fork 124
/
Copy pathtest_dataset.py
79 lines (65 loc) · 2.75 KB
/
test_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import dask.dataframe as dd
import pandas as pd
import pytest
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.import_utils import gpu_only_import
# cudf and dask_cudf are bound through gpu_only_import instead of a plain
# `import` statement. NOTE(review): presumably this keeps the module importable
# on machines without the GPU stack, deferring any failure until the names are
# actually used — confirm against nemo_curator.utils.import_utils.
cudf = gpu_only_import("cudf")
dask_cudf = gpu_only_import("dask_cudf")
def all_equal(left_result: pd.DataFrame, right_result: pd.DataFrame, gpu=True):
    """Assert that two dataframes hold the same columns and the same values.

    The set of column names must match. Each column is then compared
    element-wise after its index has been reset, so only row order matters,
    not the index labels.

    Args:
        left_result: First dataframe to compare.
        right_result: Second dataframe to compare.
        gpu: When true, columns that expose ``to_pandas`` are converted to
            pandas before the comparison. Columns that are already pandas
            objects are compared as they are, so the default is safe for
            pandas inputs too.

    Raises:
        AssertionError: If the column sets differ, a column differs in
            length, or any pair of values in a column is not equal.
    """
    l_cols = set(left_result.columns)
    r_cols = set(right_result.columns)
    assert l_cols == r_cols, f"Column mismatch: {l_cols} != {r_cols}"

    for col in left_result.columns:
        left = left_result[col].reset_index(drop=True)
        right = right_result[col].reset_index(drop=True)

        # The `all` function expects an iterable, so we need to convert cuDF to Pandas.
        # Only objects that actually offer `to_pandas` are converted; a plain
        # pandas Series has no such method and is already iterable.
        if gpu:
            if hasattr(left, "to_pandas"):
                left = left.to_pandas()
            if hasattr(right, "to_pandas"):
                right = right.to_pandas()

        # Fail with a readable assertion instead of letting the element-wise
        # comparison raise on differently sized columns.
        assert len(left) == len(right), (
            f"Length mismatch in {col} column: {len(left)} != {len(right)}"
        )
        assert all(left == right), f"Mismatch in {col} column.\n{left}\n{right}\n"
class TestDocumentDataset:
    """Construction and round-trip tests for ``DocumentDataset``.

    The tests marked ``gpu`` build their inputs with cuDF / dask_cudf.
    """

    def test_to_from_pandas(self):
        """A pandas frame is unchanged after ``from_pandas`` then ``to_pandas``."""
        original_df = pd.DataFrame(
            {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]}
        )
        dataset = DocumentDataset.from_pandas(original_df)
        converted_df = dataset.to_pandas()
        pd.testing.assert_frame_equal(original_df, converted_df)

    def test_init_pandas(self):
        """Passing a pandas frame straight to the constructor raises RuntimeError."""
        original_df = pd.DataFrame(
            {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]}
        )
        with pytest.raises(RuntimeError):
            DocumentDataset(dataset_df=original_df)

    def test_init_dask(self):
        """The constructor accepts a Dask dataframe and exposes it as ``df``."""
        original_df = pd.DataFrame(
            {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]}
        )
        ddf = dd.from_pandas(original_df, npartitions=1)
        dataset = DocumentDataset(dataset_df=ddf)
        # isinstance performs a real type check; `type(a == b)` is always a
        # (truthy) class object and therefore could never fail.
        assert isinstance(dataset.df, dd.DataFrame)
        pd.testing.assert_frame_equal(original_df, dataset.df.compute())

    @pytest.mark.gpu
    def test_to_from_cudf(self):
        """A cuDF frame keeps its values after ``from_cudf`` then ``to_cudf``."""
        original_df = cudf.DataFrame(
            {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]}
        )
        dataset = DocumentDataset.from_cudf(original_df)
        converted_df = dataset.to_cudf()
        all_equal(original_df, converted_df, gpu=True)

    @pytest.mark.gpu
    def test_init_cudf(self):
        """Passing a cuDF frame straight to the constructor raises RuntimeError."""
        original_df = cudf.DataFrame(
            {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]}
        )
        with pytest.raises(RuntimeError):
            DocumentDataset(dataset_df=original_df)

    @pytest.mark.gpu
    def test_init_dask_cudf(self):
        """The constructor accepts a dask_cudf dataframe and exposes it as ``df``."""
        original_df = cudf.DataFrame(
            {"first_col": [1, 2, 3], "second_col": ["a", "b", "c"]}
        )
        ddf = dask_cudf.from_cudf(original_df, npartitions=1)
        dataset = DocumentDataset(dataset_df=ddf)
        # isinstance performs a real type check; `type(a == b)` is always a
        # (truthy) class object and therefore could never fail.
        assert isinstance(dataset.df, dask_cudf.DataFrame)
        all_equal(original_df, dataset.df.compute(), gpu=True)