11from typing import List
22import os
3+ import gc
34
45import pytest
56import polars as pl
67
7- from dataset import clean_duplicated_rows_df , open_csv , save_df , get_duplicated_files_list_by_diff
8+ from dataset import (
9+ clean_duplicated_rows_df ,
10+ open_csv ,
11+ save_df ,
12+ start_df ,
13+ get_duplicated_files_list_by_diff
14+ )
15+ from utils .datatypes import df_schema
16+
817
918@pytest .fixture ()
1019def base_df () -> str :
@@ -13,6 +22,7 @@ def base_df() -> str:
1322 """
1423 return os .path .join ("." , "tests" , "dataset.csv" )
1524
25+
1626@pytest .fixture ()
1727def tmp_df () -> str :
1828 """
@@ -52,15 +62,60 @@ def duplicated_files() -> List[str]:
5262
5363
5464@pytest .fixture (autouse = True )
def clear_file(tmp_df, tmp_df2):
    """
    Clear the tmp csv files.

    The fixture is declared with ``autouse=True`` and has no teardown part,
    so this runs before each test and removes whatever ``tmp_df`` and
    ``tmp_df2`` files are still on disk.

    Args:
        tmp_df: Path of the first temporary csv file.
        tmp_df2: Path of the second temporary csv file.
    """
    for leftover in (tmp_df, tmp_df2):
        # Try the removal directly instead of checking for existence first:
        # one code path for both files and no gap between check and delete.
        try:
            os.remove(leftover)
        except FileNotFoundError:
            # The file was never created (or is already gone): nothing to do.
            pass
73+
class TestCSVFile:
    """Tests for opening the dataset csv and for creating it with start_df."""

    def test_open_csv(self, base_df):
        """
        Opening the base csv must work and yield the expected
        number of rows with the expected schema.
        """
        loaded = open_csv(base_df).collect()

        row_count = len(loaded)
        assert row_count == 11
        assert loaded.schema == df_schema

    def test_gen_df_no_previous_file(self, tmp_df):
        """
        When no file exists yet, start_df must create an empty csv.
        """
        # Precondition: the target path is free.
        assert not os.path.exists(tmp_df)

        start_df(tmp_df)

        assert os.path.exists(tmp_df)
        created = pl.read_csv(tmp_df)
        assert len(created) == 0

    def test_gen_df_file_already_exists(self, base_df, tmp_df):
        """
        When the file already exists, start_df must leave its rows alone.
        """
        # Seed the tmp path with a copy of the base csv.
        seed = pl.read_csv(base_df)
        seed.write_csv(tmp_df)

        assert os.path.exists(tmp_df)
        rows_before = len(pl.read_csv(tmp_df))
        assert rows_before == 11

        start_df(tmp_df)

        assert os.path.exists(tmp_df)
        rows_after = len(pl.read_csv(tmp_df))
        assert rows_after == 11
110+
111+
112+
113+
114+
115+
116+ class TestDatasetClean :
117+
62118
63- class TestDatasetGeneration :
64119 """Test dataset generation parts"""
65120
66121 def test_clean_duplicated_rows_return_the_correct_of_rows (self , base_df ):
@@ -123,7 +178,7 @@ def test_save_df_with_modifications_different_files_and_rename(self, base_df, tm
123178
124179 assert len (target_csv ) == 8
125180
126- def test_get_duplicated_files_list_by_diff (self ,base_df , duplicated_files ):
181+ def test_get_duplicated_files_list_by_diff (self , base_df , duplicated_files ):
127182 """
128183 Must take the diff between the raw csv and the cleaned one
129184 and return a list of files that are duplicated and must be
@@ -136,6 +191,48 @@ def test_get_duplicated_files_list_by_diff(self,base_df, duplicated_files):
136191
137192 assert files_list == duplicated_files
138193
def test_remove_duplicates_sequence(self, base_df, tmp_df, tmp_df2):
    """
    The full clean-up sequence (open, clean duplicated rows, list the
    duplicated files, save, swap the files) must run end to end and
    leave the expected 8 rows in the final csv.
    """
    # Work on a copy of the base csv written to the tmp path.
    seed = pl.read_csv(base_df)
    seed.write_csv(tmp_df)

    del seed
    gc.collect()

    raw = open_csv(tmp_df)
    assert len(raw.collect()) == 11

    cleaned = clean_duplicated_rows_df(raw)
    assert len(cleaned.collect()) == 8

    removed_files = get_duplicated_files_list_by_diff(raw, cleaned)
    assert len(removed_files) == 3

    # Persist the cleaned data to the second tmp path.
    save_df(cleaned, tmp_df2)

    assert os.path.exists(tmp_df2)
    assert len(pl.read_csv(tmp_df2)) == 8

    # Swap the cleaned file into the place of the original one.
    os.remove(tmp_df)
    os.rename(tmp_df2, tmp_df)

    assert os.path.exists(tmp_df)
    assert len(pl.read_csv(tmp_df)) == 8
    assert not os.path.exists(tmp_df2)

    # Drop the frame references and check the file once more.
    # NOTE(review): presumably guards against the frames still affecting
    # the file after the swap - confirm.
    del raw
    del cleaned
    gc.collect()

    assert os.path.exists(tmp_df)
    assert len(pl.read_csv(tmp_df)) == 8
232+
233+
234+
235+
139236
140237
141238 # SINCE SAVING A LAZY FRAME AS CSV IN THE SAME FILE IS NOT STABLE,
0 commit comments