
Commit a825eef

Doc (#4)
## [0.0.2] - 2024-10-21

### Added

- typehints for self
- doc strings
1 parent 1b5805c commit a825eef


7 files changed: +147 -66 lines changed


Diff for: CHANGELOG.md

+19

@@ -0,0 +1,19 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.0.2] - 2024-10-21
+
+### Added
+
+- typehints for self
+- doc strings
+
+## [0.0.1] - 2024-10-16
+
+### Added
+
+- initial version

Diff for: README.md

+1 -9

@@ -21,17 +21,9 @@
 
 library to manipulate data
 
-## Installation instructions
+## Installation
 
 ```sh
 pip install mysiar-data-flow
 ```
 
-## DataFlow.DataFrame
-
-### Usage
-For now check [mysiar_data_flow/data_flow.py](mysiar_data_flow/data_flow.py) file for interface
-
-
-
-![work in progress](.github/5578703.png)
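The README now keeps only the install command; usage documentation moves to the new Usage.md below. A minimal post-install sanity check, assuming only the standard library and the distribution name from pyproject.toml (the import name uses underscores, per the diffs below):

```python
# Check that the package is importable and report the installed version.
from importlib.metadata import version

import mysiar_data_flow  # import name with underscores

print(version("mysiar-data-flow"))  # should print 0.0.2 after this commit
```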

Diff for: Usage.md

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Usage
2+
3+
## DataFlow.DataFrame
4+
5+
6+
Create empty data frame object in memory
7+
```python
8+
9+
from mysiar_data_flow import DataFlow
10+
11+
df = DataFlow().DataFrame()
12+
df.from_pandas(df=pandas_data_frame_obj)
13+
14+
```
15+
Create data frame object in memory from Pandas data frame
16+
```python
17+
18+
from mysiar_data_flow import DataFlow
19+
20+
df = DataFlow().DataFrame().from_pandas(df=pandas_data_frame_obj)
21+
```
22+
23+
24+
25+
---
26+
For more check [mysiar_data_flow/data_flow.py](https://github.com/mysiar-org/python-data-flow/blob/master/mysiar_data_flow/data_flow.py) file for interface
27+
28+
29+
30+
![work in progress](.github/5578703.png)
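Usage.md only shows the two `from_pandas` variants; the data_flow.py diff further down adds matching converters for the other formats. A minimal sketch of a round trip using method names taken from that diff (the pandas frame and file name here are illustrative):

```python
import pandas as pd

from mysiar_data_flow import DataFlow

# Build a small pandas frame, wrap it, and export it via the chainable API.
pandas_df = pd.DataFrame({"Year": [2023, 2024], "Units": [10, 20]})

df = DataFlow().DataFrame().from_pandas(df=pandas_df)
polars_df = df.to_polars()  # convert to a polars.DataFrame
df.to_csv("units.csv")      # write the same data to CSV
```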

Diff for: mysiar_data_flow/__init__.py

+6

@@ -1 +1,7 @@
+"""
+.. include:: ../README.md
+.. include:: ../Usage.md
+.. include:: ../CHANGELOG.md
+"""
+
 from .data_flow import DataFlow
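The new module docstring pulls README.md, Usage.md, and CHANGELOG.md into the generated API docs. A sketch of rendering them, assuming the `pdoc` package added to requirements.dev.txt below and its `pdoc.pdoc()` programmatic entry point (the CLI equivalent would be `pdoc mysiar_data_flow -o docs`):

```python
# Render HTML docs for the package; the .. include:: directives in
# mysiar_data_flow/__init__.py pull in README, Usage and CHANGELOG.
from pathlib import Path

import pdoc

pdoc.pdoc("mysiar_data_flow", output_directory=Path("docs"))
```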

Diff for: mysiar_data_flow/data_flow.py

+88 -55

@@ -53,132 +53,149 @@ def __del__(self):
         if not self.__in_memory:
             delete_file(self.__filename)
 
-    def from_fireducks(self, df: fd.DataFrame):
+    def from_csv(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data = df
+            self.__data = fd.read_csv(filename)
         else:
-            from_fireducks_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
+            from_csv_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def to_fireducks(self) -> fd.DataFrame:
+    def from_feather(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            return self.__data
+            self.__data = fd.from_pandas(feather.read_feather(filename))
         else:
-            return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
+            from_feather_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+        return self
 
-    def from_pandas(self, df: pd.DataFrame):
+    def from_fireducks(self, df: fd.DataFrame) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data = fd.from_pandas(df)
+            self.__data = df
         else:
-            from_pandas_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
+            from_fireducks_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def to_pandas(self) -> pd.DataFrame:
+    def from_hdf(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            return self.__data.to_pandas()
+            self.__data = fd.read_hdf(filename)
         else:
-            return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
+            from_hdf_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+        return self
 
-    def from_polars(self, df: pl.DataFrame):
+    def from_json(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data = fd.from_pandas(df.to_pandas())
+            self.__data = fd.read_json(filename)
         else:
-            from_pandas_2_file(df=df.to_pandas(), tmp_filename=self.__filename, file_type=self.__file_type)
+            from_json_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def to_polars(self) -> pl.DataFrame:
+    def from_pandas(self, df: pd.DataFrame) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            return pl.from_pandas(self.__data.to_pandas())
+            self.__data = fd.from_pandas(df)
         else:
-            return pl.from_pandas(
-                to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
-            )
+            from_pandas_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
+        return self
 
-    def from_csv(self, filename: str):
+    def from_parquet(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data = fd.read_csv(filename)
+            self.__data = fd.read_parquet(filename)
         else:
-            from_csv_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+            from_parquet_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def to_csv(self, filename: str, index=False):
+    def from_polars(self, df: pl.DataFrame) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data.to_csv(filename, index=index)
+            self.__data = fd.from_pandas(df.to_pandas())
         else:
-            to_csv_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+            from_pandas_2_file(df=df.to_pandas(), tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def from_feather(self, filename: str):
+    def to_csv(self, filename: str, index=False) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data = fd.from_pandas(feather.read_feather(filename))
+            self.__data.to_csv(filename, index=index)
         else:
-            from_feather_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+            to_csv_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def to_feather(self, filename: str):
+    def to_feather(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
             self.__data.to_feather(filename)
         else:
             to_feather_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def from_parquet(self, filename: str):
+    def to_fireducks(self) -> fd.DataFrame:
         if self.__in_memory:
-            self.__data = fd.read_parquet(filename)
+            return self.__data
         else:
-            from_parquet_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
-        return self
+            return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
 
-    def to_parquet(self, filename: str):
+    def to_hdf(self, filename: str, key: str = "key") -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data.to_parquet(filename)
+            self.__data.to_hdf(path_or_buf=filename, key=key)
         else:
-            to_parquet_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+            to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key)
         return self
 
-    def from_json(self, filename: str):
+    def to_json(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data = fd.read_json(filename)
+            self.__data.to_json(filename)
         else:
-            from_json_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+            to_json_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def to_json(self, filename: str):
+    def to_pandas(self) -> pd.DataFrame:
         if self.__in_memory:
-            self.__data.to_json(filename)
+            return self.__data.to_pandas()
         else:
-            to_json_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
-        return self
+            return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
 
-    def from_hdf(self, filename: str):
+    def to_parquet(self, filename: str) -> "DataFlow.DataFrame":
         if self.__in_memory:
-            self.__data = fd.read_hdf(filename)
+            self.__data.to_parquet(filename)
         else:
-            from_hdf_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
+            to_parquet_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
         return self
 
-    def to_hdf(self, filename: str, key: str = "key"):
+    def to_polars(self) -> pl.DataFrame:
         if self.__in_memory:
-            self.__data.to_hdf(path_or_buf=filename, key=key)
+            return pl.from_pandas(self.__data.to_pandas())
         else:
-            to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key)
-        return self
+            return pl.from_pandas(
+                to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type).to_pandas()
+            )
 
     def columns(self) -> list:
+        """
+        lists columns in data frame
+
+        :return: list - list of columns in data frame
+        """
         if self.__in_memory:
             return self.__data.columns.to_list()
         else:
             return data_get_columns(tmp_filename=self.__filename, file_type=self.__file_type)
 
-    def columns_delete(self, columns: list):
+    def columns_delete(self, columns: list) -> "DataFlow.DataFrame":
+        """
+        deletes columns from data frame
+
+        :param columns: list - list of columns to delete
+        :return: self
+        """
         if self.__in_memory:
             self.__data.drop(columns=columns, inplace=True)
         else:
             data_delete_columns(tmp_filename=self.__filename, file_type=self.__file_type, columns=columns)
 
         return self
 
-    def columns_rename(self, columns_mapping: dict):
+    def columns_rename(self, columns_mapping: dict) -> "DataFlow.DataFrame":
+        """
+        rename columns
+
+        :param columns_mapping: dict - old_name: new_name pairs ex. {"Year": "year", "Units": "units"}
+        :return: self
+        """
         if self.__in_memory:
             self.__data.rename(columns=columns_mapping, inplace=True)
         else:
@@ -189,13 +206,28 @@ def columns_rename(self, columns_mapping: dict):
         )
         return self
 
-    def columns_select(self, columns: list):
+    def columns_select(self, columns: list) -> "DataFlow.DataFrame":
+        """
+        columns select - columns to keep in data frame
+
+        :param columns: list - list of columns to select
+        :return: self
+        """
         if self.__in_memory:
             self.__data = self.__data[columns]
         else:
             data_select_columns(tmp_filename=self.__filename, file_type=self.__file_type, columns=columns)
+        return self
 
-    def filter_on_column(self, column: str, value: Any, operator: Operator):
+    def filter_on_column(self, column: str, value: Any, operator: Operator) -> "DataFlow.DataFrame":
+        """
+        filters data on column
+
+        :param column: str - column name
+        :param value: Any - value
+        :param operator: mysiar_data_flow.lib.Operator - filter operator
+        :return: self
+        """
         if self.__in_memory:
             match operator:
                 case Operator.Eq:
@@ -218,3 +250,4 @@ def filter_on_column(self, column: str, value: Any, operator: Operator):
                 value=value,
                 operator=operator,
             )
+        return self
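With every `from_*`, `to_*`, and column method now annotated to return `"DataFlow.DataFrame"`, calls can be chained. A minimal sketch using only methods shown in this diff; `data.csv`, the column names, and the `from mysiar_data_flow.lib import Operator` import path (inferred from the `:param operator: mysiar_data_flow.lib.Operator` docstring) are assumptions:

```python
# Chainable use of the typed API added in this commit.
from mysiar_data_flow import DataFlow
from mysiar_data_flow.lib import Operator  # assumed import path for Operator

df = (
    DataFlow()
    .DataFrame()
    .from_csv("data.csv")
    .columns_rename(columns_mapping={"Year": "year", "Units": "units"})
    .columns_select(columns=["year", "units"])
    .filter_on_column(column="year", value=2024, operator=Operator.Eq)
)

df.to_parquet("filtered.parquet")  # persist the filtered frame
print(df.columns())                # list of columns, per the new docstring
```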

Diff for: pyproject.toml

+1 -1

@@ -2,7 +2,7 @@
 license = {file = "LICENSE"}
 [tool.poetry]
 name = "mysiar-data-flow"
-version = "0.0.2rc1"
+version = "0.0.2"
 readme = "README.md"
 description = "Python data manipulation library"
 authors = ["Piotr Synowiec <[email protected]>"]

Diff for: requirements.dev.txt

+2 -1

@@ -4,4 +4,5 @@ pyproject-flake8
 pytest
 pytest-cov
 poetry
-twine
+twine
+pdoc
