Commit d9ed7a2

fixed dataset read

1 parent e6a5018 commit d9ed7a2

File tree

.gitignore
dags/dataset.py
dataset.py
tests/test_dataset_generation.py

4 files changed: +113 additions, −14 deletions

.gitignore

Lines changed: 1 addition & 1 deletion

@@ -11,4 +11,4 @@ model_*
 *.jpeg
 history.json
 data/
-!tests/dataset.csv
+!tests/dataset.csv

dags/dataset.py

Lines changed: 2 additions & 1 deletion

@@ -29,6 +29,7 @@
     DEFAULT_AMOUNT_OF_CIRCUITS,
     DEFAULT_THREADS,
     images_gen_checkpoint_file,
+    dataset_file
 )
 from ghz import gen_circuit
 from export.kaggle import upload_dataset as upload_dataset_kaggle
@@ -106,7 +107,7 @@ def update_checkpoint(checkpoint: Checkpoint, stage: Stages):
     """

     gen_df = PythonOperator(
-        task_id="gen_df", python_callable=start_df, op_args=[folder]
+        task_id="gen_df", python_callable=start_df, op_args=[dataset_file(folder)]
     )
     gen_df.doc_md = """
     Generate an empty dataframe and saves it as an csv file.
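
The task now receives the full csv path instead of the base folder, matching the new start_df(filename) signature below. dataset_file itself is outside this diff; the following is a minimal sketch of the assumed helper (hypothetical implementation, the real one lives elsewhere in the project and may differ):

import os

# Hypothetical sketch only: the real dataset_file is defined in the
# project's config/utils module and is not shown in this commit.
# Assumed behaviour: resolve the dataset csv path inside a base folder.
def dataset_file(base_folder: str) -> str:
    return os.path.join(base_folder, "dataset.csv")

# Under that assumption, the operator now passes e.g. "data/dataset.csv"
# to start_df instead of the bare folder "data/".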

dataset.py

Lines changed: 6 additions & 5 deletions

@@ -384,8 +384,6 @@ def get_duplicated_files_list_by_diff(df:pl.LazyFrame, clean_df:pl.LazyFrame) ->
     """
     Get the files that are duplicated by applying a df diff.
     """
-
-
     duplicated_files = df.join(clean_df, on=df.collect_schema().names(), how="anti").collect().get_column("file")

     return duplicated_files.to_list() # type: ignore
@@ -467,15 +465,18 @@ def append_rows_to_df(file_path: FilePath, rows: Rows):
         writer.writerows(rows)


-def start_df(base_file_path: FilePath):
+def start_df(filename:FilePath):
     """
     generates an empty df and saves it on a csv file.

     It's not a good idea to use the scan_csv+sink_csv, but for
     an empty lazyFrame it works well.
     """
+    if(os.path.exists(filename)):
+        return
+
     df = create_df()
-    save_df(df, dataset_file(base_file_path))
+    save_df(df, filename)

     del df
     gc.collect()
@@ -484,7 +485,7 @@ def start_df(base_file_path: FilePath):
 def main(args: Arguments):
     """generate, clean and save dataset and images"""

-    crate_dataset_folder(args.target_folder)
+    crate_dataset_folder(dataset_file(args.target_folder))

     start_df(args.target_folder)
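
start_df now takes the target filename directly and returns early when the file already exists, so an existing dataset is never overwritten. create_df and save_df are not part of this diff; the sketch below shows how they could plausibly look in polars, assuming an empty LazyFrame built from the project's schema and written with sink_csv (the scan_csv+sink_csv caveat in the docstring applies to non-empty frames):

import polars as pl

# Placeholder schema: the real one is utils.datatypes.df_schema,
# which is not shown in this commit.
df_schema = {"file": pl.Utf8, "qubits": pl.Int64}

def create_df() -> pl.LazyFrame:
    # Empty LazyFrame carrying only the expected column names and dtypes.
    return pl.LazyFrame(schema=df_schema)

def save_df(df: pl.LazyFrame, filename: str) -> None:
    # Stream the (here empty) frame straight to disk without collecting it.
    df.sink_csv(filename)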

tests/test_dataset_generation.py

Lines changed: 104 additions & 7 deletions

@@ -1,10 +1,19 @@
 from typing import List
 import os
+import gc

 import pytest
 import polars as pl

-from dataset import clean_duplicated_rows_df, open_csv, save_df, get_duplicated_files_list_by_diff
+from dataset import (
+    clean_duplicated_rows_df,
+    open_csv,
+    save_df,
+    start_df,
+    get_duplicated_files_list_by_diff
+)
+from utils.datatypes import df_schema
+

 @pytest.fixture()
 def base_df() -> str:
@@ -13,6 +22,7 @@ def base_df() -> str:
     """
     return os.path.join(".", "tests", "dataset.csv")

+
 @pytest.fixture()
 def tmp_df() -> str:
     """
@@ -52,15 +62,60 @@ def duplicated_files() -> List[str]:


 @pytest.fixture(autouse=True)
-def clear_file(tmp_df):
+def clear_file(tmp_df, tmp_df2):
     """
     Clear tmp csv file
     """
-    if(not os.path.exists(tmp_df)):
-        return
-    os.remove(tmp_df)
+    if(os.path.exists(tmp_df)):
+        os.remove(tmp_df)
+    if(os.path.exists(tmp_df2)):
+        os.remove(tmp_df2)
+
+class TestCSVFile:
+    def test_open_csv(self,base_df):
+        """
+        Should open the df with no problems and cast it
+        to correct data types.
+        """
+        df = open_csv(base_df).collect()
+
+        assert len(df) == 11
+        assert df.schema == df_schema
+
+    def test_gen_df_no_previous_file(self,tmp_df):
+        """
+        should create a new csv file.
+        """
+
+        assert not os.path.exists(tmp_df)
+        start_df(tmp_df)
+        assert os.path.exists(tmp_df)
+
+        df_data = pl.read_csv(tmp_df)
+        assert len(df_data) == 0
+
+    def test_gen_df_file_already_exists(self,base_df,tmp_df):
+        """
+        Should not overwrite the existent file.
+        """
+
+        df = pl.read_csv(base_df)
+        df.write_csv(tmp_df)
+
+        assert os.path.exists(tmp_df)
+        assert len(pl.read_csv(tmp_df)) == 11
+        start_df(tmp_df)
+        assert os.path.exists(tmp_df)
+        assert len(pl.read_csv(tmp_df)) == 11
+
+
+
+
+
+
+class TestDatasetClean:
+

-class TestDatasetGeneration:
     """Test dataset generation parts"""

     def test_clean_duplicated_rows_return_the_correct_of_rows(self, base_df):
@@ -123,7 +178,7 @@ def test_save_df_with_modifications_different_files_and_rename(self, base_df, tm

         assert len(target_csv) == 8

-    def test_get_duplicated_files_list_by_diff(self,base_df, duplicated_files):
+    def test_get_duplicated_files_list_by_diff(self, base_df, duplicated_files):
         """
         Must take the diff between the raw csv and the cleaned one
         and return a list of files that are duplicated and must be
@@ -136,6 +191,48 @@ def test_get_duplicated_files_list_by_diff(self,base_df, duplicated_files):

         assert files_list == duplicated_files

+    def test_remove_duplicates_sequence(self, base_df, tmp_df, tmp_df2):
+        """
+        We must be able to run the entire clean up sequence without losing
+        any data.
+        """
+
+        df = pl.read_csv(base_df)
+        df.write_csv(tmp_df)
+
+        del df
+        gc.collect()
+
+
+        df = open_csv(tmp_df)
+        assert len(df.collect()) == 11
+        clean_df = clean_duplicated_rows_df(df)
+        assert len(clean_df.collect()) == 8
+        duplicated_files = get_duplicated_files_list_by_diff(df, clean_df)
+        assert len(duplicated_files) == 3
+
+        save_df(clean_df, tmp_df2)
+
+        assert os.path.exists(tmp_df2)
+        assert len(pl.read_csv(tmp_df2)) == 8
+
+        os.remove(tmp_df)
+        os.rename(tmp_df2, tmp_df)
+
+        assert os.path.exists(tmp_df)
+        assert len(pl.read_csv(tmp_df)) == 8
+        assert not os.path.exists(tmp_df2)
+
+        del df
+        del clean_df
+        gc.collect()
+
+        assert os.path.exists(tmp_df)
+        assert len(pl.read_csv(tmp_df)) == 8
+
+
+


 # SINCE SAVING A LAZY FRAME AS CSV IN THE SAME FILE IS NOT STABLE,
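
The new end-to-end test drives open_csv → clean_duplicated_rows_df → get_duplicated_files_list_by_diff → save_df against the sample csv and checks that 3 of the 11 rows are dropped. The snippet below is a self-contained toy version of the same dedup-by-diff idea with made-up data, under the assumption that clean_duplicated_rows_df deduplicates on the feature columns while keeping the first file per group (the real implementation is not shown in this commit):

import polars as pl

# Toy data: only the "file" column name is taken from the diff,
# the other columns and values are placeholders.
raw = pl.LazyFrame({
    "file": ["a.png", "b.png", "c.png", "d.png"],
    "qubits": [2, 3, 3, 4],
    "depth": [5, 7, 7, 9],
})

# Assumed behaviour of clean_duplicated_rows_df: keep one row per unique
# feature combination, deduplicating on everything except "file".
clean = raw.unique(subset=["qubits", "depth"], keep="first", maintain_order=True)

# The anti-join used by get_duplicated_files_list_by_diff: rows of the raw
# frame with no exact match in the cleaned frame are the dropped duplicates,
# and their "file" values are the images that can be deleted.
duplicated_files = (
    raw.join(clean, on=raw.collect_schema().names(), how="anti")
    .collect()
    .get_column("file")
    .to_list()
)
print(duplicated_files)  # ["c.png"]  (its row duplicates b.png's features)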
