Skip to content

Commit b2fbfe9

Browse files
committed
dev: small refactor
1 parent c60172c commit b2fbfe9

14 files changed

+108
-130
lines changed

Diff for: .github/workflows/ci.yaml renamed to .github/workflows/tests.yaml

+7-5
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,14 @@
1-
name: Python data-flow
1+
name: Python data-flow Tests
22

3-
on: [push]
3+
on: [ push ]
44

55
jobs:
6-
build:
6+
tests:
77

88
runs-on: ubuntu-latest
99
strategy:
1010
matrix:
11-
python-version: ["3.10", "3.11", "3.12"]
11+
python-version: [ "3.10", "3.11", "3.12" ]
1212

1313
steps:
1414
- uses: actions/checkout@v4
@@ -22,4 +22,6 @@ jobs:
2222
- name: Install modules
2323
run: pip install -r requirements.txt && pip install -r requirements.dev.txt
2424
- name: Tests
25-
run: PYTHONPATH=. pytest --cov=data_flow --cov-report term
25+
run: PYTHONPATH=. pytest --cov=data_flow --cov-report term
26+
- name: Lint
27+
run: flake8 data_flow/

Diff for: Makefile

+1-1
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ pip::
88
venv/bin/pip install -r requirements.dev.txt
99

1010
tests::
11-
PYTHONPATH=. venv/bin/pytest -rP tests/ -vvv --cov=data_flow --cov-report html --cov-report term
11+
PYTHONPATH=. venv/bin/pytest --cov=data_flow --cov-report html --cov-report term -rP tests/ -vvv
1212

1313
lint::
1414
venv/bin/flake8 data_flow/

Diff for: README.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
# DataFlow
22

3-
![tests](https://github.com/mysiar-org/python-data-flow/actions/workflows/ci.yaml/badge.svg)
3+
![tests](https://github.com/mysiar-org/python-data-flow/actions/workflows/tests.yaml/badge.svg)
44
[![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-3100/)
55
[![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-3110/)
66
[![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120/)

Diff for: data_flow/data_flow.py

+11-2
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
import fireducks.pandas as fd
55
import pandas as pd
6+
import polars as pl
67
from pyarrow import feather
78

89
from data_flow.lib import FileType
@@ -44,18 +45,26 @@ def __del__(self):
4445
if not self.__in_memory:
4546
delete_file(self.__filename)
4647

47-
def get_data_fireducks(self) -> fd.DataFrame:
48+
def to_fireducks(self) -> fd.DataFrame:
4849
if self.__in_memory:
4950
return self.__data
5051
else:
5152
return df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type)
5253

53-
def get_data_pandas(self) -> pd.DataFrame:
54+
def to_pandas(self) -> pd.DataFrame:
5455
if self.__in_memory:
5556
return self.__data.to_pandas()
5657
else:
5758
return df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
5859

60+
def to_polars(self) -> pl.DataFrame:
61+
if self.__in_memory:
62+
return pl.from_pandas(self.__data.to_pandas())
63+
else:
64+
return pl.from_pandas(
65+
df_from_tmp_filename(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
66+
)
67+
5968
def from_csv(self, filename: str):
6069
if self.__in_memory:
6170
self.__data = fd.read_csv(filename)

Diff for: requirements.txt

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
fireducks
22
tables
33
pyarrow
4-
pandas
4+
pandas
5+
polars

Diff for: tests/BaseTestCase.py

+6-1
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,12 @@
11
import unittest
22
from zipfile import ZipFile
33

4+
import pandas as pd
5+
46

57
class BaseTestCase(unittest.TestCase):
68
def setUp(self):
7-
zip = ZipFile(self.ZIP_FILE).extractall("./tests/data")
9+
ZipFile(self.ZIP_FILE).extractall("./tests/data")
810

911
ZIP_FILE = "./tests/data/annual-enterprise-survey-2023-financial-year-provisional.zip"
1012
CSV_FILE = "./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv"
@@ -13,3 +15,6 @@ def setUp(self):
1315
TEST_CSV_FILE = "/tmp/data-flow.csv"
1416
TEST_JSON_FILE = "/tmp/data-flow.json"
1517
TEST_HDF_FILE = "/tmp/data-flow.h5"
18+
19+
def assertPandasEqual(self, df1: pd.DataFrame, df2: pd.DataFrame):
20+
self.assertTrue(df1.equals(df2), "Pandas DataFrames are not equal !")

Diff for: tests/SequenceTestCase.py

+1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ def _sequence(self, data: DataFlow.DataFrame) -> None:
1919
"Variable_category",
2020
]
2121
)
22+
2223
self.assertEqual(3, len(data.columns()))
2324
self.assertListEqual(["Year", "Units", "Value"], data.columns())
2425

Diff for: tests/test_base_test_case.py

+22
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
import unittest
2+
3+
import pandas as pd
4+
5+
from tests.BaseTestCase import BaseTestCase
6+
7+
8+
class BaseTestCaseTestCase(BaseTestCase):
9+
def test_assert_pandas_equal(self):
10+
df1 = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
11+
df2 = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
12+
df3 = pd.DataFrame({"Name": ["Tom", "nick", "krish", "jack"], "Age": [20, 21, 19, 18]})
13+
14+
self.assertPandasEqual(df1, df2)
15+
16+
with self.assertRaises(AssertionError) as context:
17+
self.assertPandasEqual(df1, df3)
18+
self.assertEqual(str(context.exception), "False is not true : Pandas DataFrames are not equal !")
19+
20+
21+
if __name__ == "__main__":
22+
unittest.main()

Diff for: tests/test_data_flow.py

-83
This file was deleted.

Diff for: tests/test_data_flow_csv.py

+12-8
Original file line number | Diff line number | Diff line change
@@ -6,29 +6,33 @@
66

77
class DataFlowCSVTestCase(SequenceTestCase):
88
def test_memory(self):
9-
data = (
9+
df = (
1010
DataFlow().DataFrame().from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
1111
)
12-
self._sequence(data=data)
12+
df.to_csv(self.TEST_CSV_FILE)
1313

14-
data.to_csv(self.TEST_CSV_FILE)
15-
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.TEST_CSV_FILE).get_data_pandas())
14+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
15+
self._sequence(data=df)
1616

1717
def test_parquet(self):
18-
data = (
18+
df = (
1919
DataFlow()
2020
.DataFrame(in_memory=False)
2121
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
2222
)
23-
self._sequence(data=data)
23+
24+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
25+
self._sequence(data=df)
2426

2527
def test_feather(self):
26-
data = (
28+
df = (
2729
DataFlow()
2830
.DataFrame(in_memory=False, file_type=FileType.feather)
2931
.from_csv("./tests/data/annual-enterprise-survey-2023-financial-year-provisional.csv")
3032
)
31-
self._sequence(data=data)
33+
34+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
35+
self._sequence(data=df)
3236

3337

3438
if __name__ == "__main__":

Diff for: tests/test_data_flow_feather.py

+12-7
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,22 @@ def setUp(self):
1212
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_feather(self.TEST_FEATHER_FILE)
1313

1414
def test_memory(self):
15-
data = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)
16-
self._sequence(data=data)
17-
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
15+
df = DataFlow().DataFrame().from_feather(self.TEST_FEATHER_FILE)
16+
17+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
18+
self._sequence(data=df)
1819

1920
def test_parquet(self):
20-
data = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)
21-
self._sequence(data=data)
21+
df = DataFlow().DataFrame(in_memory=False).from_feather(self.TEST_FEATHER_FILE)
22+
23+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
24+
self._sequence(data=df)
2225

2326
def test_feather(self):
24-
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)
25-
self._sequence(data=data)
27+
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_feather(self.TEST_FEATHER_FILE)
28+
29+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
30+
self._sequence(data=df)
2631

2732

2833
if __name__ == "__main__":

Diff for: tests/test_data_flow_hdf.py

+12-7
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,22 @@ def setUp(self):
1212
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_hdf(self.TEST_HDF_FILE)
1313

1414
def test_memory(self):
15-
data = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)
16-
self._sequence(data=data)
17-
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
15+
df = DataFlow().DataFrame().from_hdf(self.TEST_HDF_FILE)
16+
17+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
18+
self._sequence(data=df)
1819

1920
def test_parquet(self):
20-
data = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)
21-
self._sequence(data=data)
21+
df = DataFlow().DataFrame(in_memory=False).from_hdf(self.TEST_HDF_FILE)
22+
23+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
24+
self._sequence(data=df)
2225

2326
def test_feather(self):
24-
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)
25-
self._sequence(data=data)
27+
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_hdf(self.TEST_HDF_FILE)
28+
29+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
30+
self._sequence(data=df)
2631

2732

2833
if __name__ == "__main__":

Diff for: tests/test_data_flow_json.py

+9-7
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,19 @@ def setUp(self):
1212
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_json(self.TEST_JSON_FILE)
1313

1414
def test_memory(self):
15-
data = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
16-
self._sequence(data=data)
17-
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
15+
df = DataFlow().DataFrame().from_json(self.TEST_JSON_FILE)
16+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
17+
self._sequence(data=df)
1818

1919
def test_parquet(self):
20-
data = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
21-
self._sequence(data=data)
20+
df = DataFlow().DataFrame(in_memory=False).from_json(self.TEST_JSON_FILE)
21+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
22+
self._sequence(data=df)
2223

2324
def test_feather(self):
24-
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
25-
self._sequence(data=data)
25+
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_json(self.TEST_JSON_FILE)
26+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
27+
self._sequence(data=df)
2628

2729

2830
if __name__ == "__main__":

Diff for: tests/test_data_flow_parquet.py

+12-7
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,22 @@ def setUp(self):
1212
DataFlow().DataFrame().from_csv(self.CSV_FILE).to_parquet(self.TEST_PARQUET_FILE)
1313

1414
def test_memory(self):
15-
data = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)
16-
self._sequence(data=data)
17-
data.get_data_pandas().equals(DataFlow().DataFrame().from_csv(self.CSV_FILE).get_data_pandas())
15+
df = DataFlow().DataFrame().from_parquet(self.TEST_PARQUET_FILE)
16+
17+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
18+
self._sequence(data=df)
1819

1920
def test_parquet(self):
20-
data = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)
21-
self._sequence(data=data)
21+
df = DataFlow().DataFrame(in_memory=False).from_parquet(self.TEST_PARQUET_FILE)
22+
23+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
24+
self._sequence(data=df)
2225

2326
def test_feather(self):
24-
data = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)
25-
self._sequence(data=data)
27+
df = DataFlow().DataFrame(in_memory=False, file_type=FileType.feather).from_parquet(self.TEST_PARQUET_FILE)
28+
29+
self.assertPandasEqual(df.to_pandas(), DataFlow().DataFrame().from_csv(self.CSV_FILE).to_pandas())
30+
self._sequence(data=df)
2631

2732

2833
if __name__ == "__main__":

0 commit comments

Comments
 (0)