Skip to content

Commit 6ba14a5

Browse files
committed
dev: update
1 parent 9e0b0c5 commit 6ba14a5

10 files changed

+19
-77
lines changed

Diff for: .github/workflows/tests.yaml

-1
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@ jobs:
1616
uses: actions/setup-python@v5
1717
with:
1818
python-version: ${{ matrix.python-version }}
19-
# You can test your matrix by printing the current Python version
2019
- name: Display Python version
2120
run: python -c "import sys; print(sys.version)"
2221
- name: Install modules

Diff for: data_flow/data_flow.py

+4 -25
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,6 @@
2424
)
2525
from data_flow.lib.fireducks import from_fireducks_2_file, to_fireducks_from_file
2626
from data_flow.lib.pandas import from_pandas_2_file
27-
from data_flow.lib.polars import from_polars_2_file, to_polars_from_file
2827
from data_flow.lib.tools import generate_temporary_filename, delete_file
2928

3029

@@ -77,14 +76,16 @@ def from_polars(self, df: pl.DataFrame):
7776
if self.__in_memory:
7877
self.__data = fd.from_pandas(df.to_pandas())
7978
else:
80-
from_polars_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
79+
from_pandas_2_file(df=df.to_pandas(), tmp_filename=self.__filename, file_type=self.__file_type)
8180
return self
8281

8382
def to_polars(self) -> pl.DataFrame:
8483
if self.__in_memory:
8584
return pl.from_pandas(self.__data.to_pandas())
8685
else:
87-
return to_polars_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
86+
return pl.from_pandas(
87+
to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
88+
)
8889

8990
def from_csv(self, filename: str):
9091
if self.__in_memory:
@@ -156,28 +157,6 @@ def to_hdf(self, filename: str, key: str = "key"):
156157
to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key)
157158
return self
158159

159-
def head(self):
160-
if self.__in_memory:
161-
print(self.__data.head())
162-
else:
163-
print(to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).head())
164-
return self
165-
166-
def stats(self):
167-
if self.__in_memory:
168-
data = self.__data
169-
else:
170-
data = to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
171-
172-
print("***** Data stats *****")
173-
print(f"Columns names : {data.columns.to_list()}")
174-
print(f"Columns count : {len(data.columns)}")
175-
print(f"Rows count : {len(data)}")
176-
print("Data types :")
177-
print(data.dtypes)
178-
print("**********************")
179-
return self
180-
181160
def del_columns(self, columns: list):
182161
if self.__in_memory:
183162
self.__data.drop(columns=columns, inplace=True)

Diff for: data_flow/lib/pandas.py

-10
Original file line number | Diff line number | Diff line change
@@ -12,13 +12,3 @@ def from_pandas_2_file(df: pd.DataFrame, tmp_filename: str, file_type: FileType)
1212
fd.from_pandas(df).to_feather(tmp_filename)
1313
case _:
1414
raise ValueError(f"File type not implemented: {file_type} !")
15-
16-
17-
def to_pandas_from_file(tmp_filename: str, file_type: FileType) -> fd.DataFrame:
18-
match file_type:
19-
case FileType.parquet:
20-
return pd.read_parquet(tmp_filename)
21-
case FileType.feather:
22-
return pd.read_feather(tmp_filename)
23-
case _:
24-
raise ValueError(f"File type not implemented: {file_type} !")

Diff for: data_flow/lib/polars.py

-12
This file was deleted.

Diff for: tests/SequenceTestCase.py

+2
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,8 @@
44

55
class SequenceTestCase(BaseTestCase):
66
def _sequence(self, data: DataFlow.DataFrame) -> None:
7+
self.assertPandasEqual(data.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
8+
79
polars = data.to_polars()
810

911
self.assertEqual(10, len(data.columns()))

Diff for: tests/test_data_flow_csv.py

+10 -17
Original file line number | Diff line number | Diff line change
@@ -2,36 +2,29 @@
22

33
from data_flow import DataFlow
44
from data_flow.lib import FileType
5+
from data_flow.lib.tools import delete_file
56
from tests.SequenceTestCase import SequenceTestCase
67

78

89
class DataFlowCSVTestCase(SequenceTestCase):
10+
def setUp(self):
11+
super().setUp()
12+
delete_file(self.TEST_CSV_FILE)
13+
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_csv(self.TEST_CSV_FILE)
14+
915
def test_memory(self):
10-
df = (
11-
DataFlow().DataFrame().from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
12-
)
13-
df.to_csv(self.TEST_CSV_FILE)
14-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
16+
df = DataFlow().DataFrame().from_csv(self.TEST_CSV_FILE)
17+
1518
self._sequence(data=df)
1619

1720
def test_parquet(self):
18-
df = (
19-
DataFlow()
20-
.DataFrame(in_memory=False)
21-
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
22-
)
21+
df = DataFlow().DataFrame(in_memory=False).from_csv(self.TEST_CSV_FILE)
2322

24-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
2523
self._sequence(data=df)
2624

2725
def test_feather(self):
28-
df = (
29-
DataFlow()
30-
.DataFrame(in_memory=False, file_type=FileType.feather)
31-
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
32-
)
26+
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_csv(self.TEST_CSV_FILE)
3327

34-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
3528
self._sequence(data=df)
3629

3730

Diff for: tests/test_data_flow_feather.py

-3
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,16 @@ def setUp(self):
1515
def test_memory(self):
1616
df = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)
1717

18-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
1918
self._sequence(data=df)
2019

2120
def test_parquet(self):
2221
df = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)
2322

24-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
2523
self._sequence(data=df)
2624

2725
def test_feather(self):
2826
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)
2927

30-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
3128
self._sequence(data=df)
3229

3330

Diff for: tests/test_data_flow_hdf.py

-3
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,16 @@ def setUp(self):
1515
def test_memory(self):
1616
df = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)
1717

18-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
1918
self._sequence(data=df)
2019

2120
def test_parquet(self):
2221
df = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)
2322

24-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
2523
self._sequence(data=df)
2624

2725
def test_feather(self):
2826
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)
2927

30-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
3128
self._sequence(data=df)
3229

3330

Diff for: tests/test_data_flow_json.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -14,17 +14,17 @@ def setUp(self):
1414

1515
def test_memory(self):
1616
df = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
17-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
17+
1818
self._sequence(data=df)
1919

2020
def test_parquet(self):
2121
df = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
22-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
22+
2323
self._sequence(data=df)
2424

2525
def test_feather(self):
2626
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
27-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
27+
2828
self._sequence(data=df)
2929

3030

Diff for: tests/test_data_flow_parquet.py

-3
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,16 @@ def setUp(self):
1515
def test_memory(self):
1616
df = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)
1717

18-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
1918
self._sequence(data=df)
2019

2120
def test_parquet(self):
2221
df = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)
2322

24-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
2523
self._sequence(data=df)
2624

2725
def test_feather(self):
2826
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)
2927

30-
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
3128
self._sequence(data=df)
3229

3330

0 commit comments

Comments
 (0)