@@ -53,132 +53,149 @@ def __del__(self):
53
53
if not self .__in_memory :
54
54
delete_file (self .__filename )
55
55
56
def from_csv(self, filename: str) -> "DataFlow.DataFrame":
    """
    loads data from a CSV file

    in-memory mode stores the result of fd.read_csv; otherwise the work is
    delegated to from_csv_2_file together with the temporary file name and type

    :param filename: str - path of the CSV file to read
    :return: self
    """
    if not self.__in_memory:
        from_csv_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data = fd.read_csv(filename)
    return self
62
62
63
def from_feather(self, filename: str) -> "DataFlow.DataFrame":
    """
    loads data from a Feather file

    in-memory mode reads the file with feather.read_feather and converts the
    result with fd.from_pandas; otherwise from_feather_2_file is called with
    the temporary file name and type

    :param filename: str - path of the Feather file to read
    :return: self
    """
    if self.__in_memory:
        loaded = feather.read_feather(filename)
        self.__data = fd.from_pandas(loaded)
        return self
    from_feather_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
    return self
68
69
69
def from_fireducks(self, df: fd.DataFrame) -> "DataFlow.DataFrame":
    """
    takes data from an fd.DataFrame

    in-memory mode keeps a reference to the passed frame (no copy is made
    here); otherwise the frame is handed to from_fireducks_2_file

    :param df: fd.DataFrame - source data frame
    :return: self
    """
    if not self.__in_memory:
        from_fireducks_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data = df
    return self
75
76
76
def from_hdf(self, filename: str) -> "DataFlow.DataFrame":
    """
    loads data from an HDF file

    in-memory mode stores the result of fd.read_hdf; otherwise the work is
    delegated to from_hdf_2_file with the temporary file name and type

    :param filename: str - path of the HDF file to read
    :return: self
    """
    if self.__in_memory:
        self.__data = fd.read_hdf(filename)
        return self
    from_hdf_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
    return self
81
83
82
def from_json(self, filename: str) -> "DataFlow.DataFrame":
    """
    loads data from a JSON file

    in-memory mode stores the result of fd.read_json; otherwise the work is
    delegated to from_json_2_file with the temporary file name and type

    :param filename: str - path of the JSON file to read
    :return: self
    """
    if not self.__in_memory:
        from_json_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data = fd.read_json(filename)
    return self
88
90
89
def from_pandas(self, df: pd.DataFrame) -> "DataFlow.DataFrame":
    """
    takes data from a pandas data frame

    in-memory mode converts the frame with fd.from_pandas and stores it;
    otherwise the pandas frame is handed to from_pandas_2_file

    :param df: pd.DataFrame - source data frame
    :return: self
    """
    if self.__in_memory:
        converted = fd.from_pandas(df)
        self.__data = converted
        return self
    from_pandas_2_file(df=df, tmp_filename=self.__filename, file_type=self.__file_type)
    return self
96
97
97
def from_parquet(self, filename: str) -> "DataFlow.DataFrame":
    """
    loads data from a Parquet file

    in-memory mode stores the result of fd.read_parquet; otherwise the work
    is delegated to from_parquet_2_file with the temporary file name and type

    :param filename: str - path of the Parquet file to read
    :return: self
    """
    if not self.__in_memory:
        from_parquet_2_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data = fd.read_parquet(filename)
    return self
103
104
104
def from_polars(self, df: pl.DataFrame) -> "DataFlow.DataFrame":
    """
    takes data from a polars data frame

    the polars frame is first converted with its to_pandas method; in-memory
    mode then stores fd.from_pandas of that result, otherwise the pandas
    frame is handed to from_pandas_2_file

    :param df: pl.DataFrame - source data frame
    :return: self
    """
    # Both modes go through pandas, so convert once up front.
    as_pandas = df.to_pandas()
    if self.__in_memory:
        self.__data = fd.from_pandas(as_pandas)
    else:
        from_pandas_2_file(df=as_pandas, tmp_filename=self.__filename, file_type=self.__file_type)
    return self
110
111
111
def to_csv(self, filename: str, index=False) -> "DataFlow.DataFrame":
    """
    writes data to a CSV file

    in-memory mode calls to_csv on the stored frame; otherwise the work is
    delegated to to_csv_from_file with the temporary file name and type

    :param filename: str - path of the CSV file to write
    :param index: forwarded as the index argument of the in-memory to_csv call
    :return: self
    """
    if not self.__in_memory:
        # NOTE(review): `index` is not passed on this path - confirm whether
        # to_csv_from_file should receive it as well.
        to_csv_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data.to_csv(filename, index=index)
    return self
117
118
118
def to_feather(self, filename: str) -> "DataFlow.DataFrame":
    """
    writes data to a Feather file

    in-memory mode calls to_feather on the stored frame; otherwise the work
    is delegated to to_feather_from_file with the temporary file name and type

    :param filename: str - path of the Feather file to write
    :return: self
    """
    if not self.__in_memory:
        to_feather_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data.to_feather(filename)
    return self
124
125
125
def to_fireducks(self) -> fd.DataFrame:
    """
    returns data as an fd.DataFrame

    in-memory mode returns the stored frame itself (not a copy); otherwise
    the result of to_fireducks_from_file is returned

    :return: fd.DataFrame
    """
    if not self.__in_memory:
        return to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
    return self.__data
131
131
132
def to_hdf(self, filename: str, key: str = "key") -> "DataFlow.DataFrame":
    """
    writes data to an HDF file

    in-memory mode calls to_hdf on the stored frame; otherwise the work is
    delegated to to_hdf_from_file; the key is forwarded in both modes

    :param filename: str - path of the HDF file to write
    :param key: str - passed as the key argument of the write call
    :return: self
    """
    if not self.__in_memory:
        to_hdf_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type, key=key)
        return self
    self.__data.to_hdf(path_or_buf=filename, key=key)
    return self
138
138
139
def to_json(self, filename: str) -> "DataFlow.DataFrame":
    """
    writes data to a JSON file

    in-memory mode calls to_json on the stored frame; otherwise the work is
    delegated to to_json_from_file with the temporary file name and type

    :param filename: str - path of the JSON file to write
    :return: self
    """
    if not self.__in_memory:
        to_json_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data.to_json(filename)
    return self
145
145
146
def to_pandas(self) -> pd.DataFrame:
    """
    returns data as a pandas data frame

    the source frame is the stored one in in-memory mode, or the result of
    to_fireducks_from_file otherwise; its to_pandas method produces the result

    :return: pd.DataFrame
    """
    if self.__in_memory:
        frame = self.__data
    else:
        frame = to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
    return frame.to_pandas()
152
151
153
def to_parquet(self, filename: str) -> "DataFlow.DataFrame":
    """
    writes data to a Parquet file

    in-memory mode calls to_parquet on the stored frame; otherwise the work
    is delegated to to_parquet_from_file with the temporary file name and type

    :param filename: str - path of the Parquet file to write
    :return: self
    """
    if not self.__in_memory:
        to_parquet_from_file(filename=filename, tmp_filename=self.__filename, file_type=self.__file_type)
        return self
    self.__data.to_parquet(filename)
    return self
159
158
160
def to_polars(self) -> pl.DataFrame:
    """
    returns data as a polars data frame

    the source frame is the stored one in in-memory mode, or the result of
    to_fireducks_from_file otherwise; it is converted with to_pandas and then
    passed to pl.from_pandas

    :return: pl.DataFrame
    """
    if self.__in_memory:
        frame = self.__data
    else:
        frame = to_fireducks_from_file(tmp_filename=self.__filename, file_type=self.__file_type)
    return pl.from_pandas(frame.to_pandas())
166
166
167
167
def columns(self) -> list:
    """
    lists the column names of the data frame

    in-memory mode reads them from the stored frame's columns attribute;
    otherwise they come from data_get_columns

    :return: list - column names
    """
    if not self.__in_memory:
        return data_get_columns(tmp_filename=self.__filename, file_type=self.__file_type)
    return self.__data.columns.to_list()
172
177
173
def columns_delete(self, columns: list) -> "DataFlow.DataFrame":
    """
    removes the given columns from the data frame

    in-memory mode drops them from the stored frame in place; otherwise the
    work is delegated to data_delete_columns

    :param columns: list - names of the columns to remove
    :return: self
    """
    if not self.__in_memory:
        data_delete_columns(tmp_filename=self.__filename, file_type=self.__file_type, columns=columns)
        return self
    self.__data.drop(columns=columns, inplace=True)
    return self
180
191
181
- def columns_rename (self , columns_mapping : dict ):
192
+ def columns_rename (self , columns_mapping : dict ) -> "DataFlow.DataFrame" :
193
+ """
194
+ rename columns
195
+
196
+ :param columns_mapping: dict - old_name: new_name pairs ex. {"Year": "year", "Units": "units"}
197
+ :return: self
198
+ """
182
199
if self .__in_memory :
183
200
self .__data .rename (columns = columns_mapping , inplace = True )
184
201
else :
@@ -189,13 +206,28 @@ def columns_rename(self, columns_mapping: dict):
189
206
)
190
207
return self
191
208
192
def columns_select(self, columns: list) -> "DataFlow.DataFrame":
    """
    keeps only the given columns in the data frame

    in-memory mode replaces the stored frame with its selection by the given
    column list; otherwise the work is delegated to data_select_columns

    :param columns: list - names of the columns to keep
    :return: self
    """
    if not self.__in_memory:
        data_select_columns(tmp_filename=self.__filename, file_type=self.__file_type, columns=columns)
        return self
    selected = self.__data[columns]
    self.__data = selected
    return self
197
221
198
- def filter_on_column (self , column : str , value : Any , operator : Operator ):
222
+ def filter_on_column (self , column : str , value : Any , operator : Operator ) -> "DataFlow.DataFrame" :
223
+ """
224
+ filters data on column
225
+
226
+ :param column: str - column name
227
+ :param value: Any - value
228
+ :param operator: mysiar_data_flow.lib.Operator - filter operator
229
+ :return: self
230
+ """
199
231
if self .__in_memory :
200
232
match operator :
201
233
case Operator .Eq :
@@ -218,3 +250,4 @@ def filter_on_column(self, column: str, value: Any, operator: Operator):
218
250
value = value ,
219
251
operator = operator ,
220
252
)
253
+ return self
0 commit comments