-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmy_library.py
190 lines (164 loc) · 6.95 KB
/
my_library.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def data_frame_sorter_by_value(df_data,
list_of_columns_to_groupby,
column_value,
ascending_order=False,
additional_column_to_sort_on=[],
nan_position="first"):
import pandas as pd
import numpy as np
df=df_data.copy()
df=pd.merge(df,
df.groupby(list_of_columns_to_groupby)[column_value].sum().reset_index(name="sort_on"),
left_on= list_of_columns_to_groupby,
right_on = list_of_columns_to_groupby,
how="left")
df = df.sort_values(["sort_on"]+list_of_columns_to_groupby+additional_column_to_sort_on,
ascending=ascending_order,
na_position=nan_position).reset_index(drop=True)
df.drop("sort_on",axis=1,inplace=True)
return(df)
def rolling_window_slicer(series,window):
"""Slice a series into rolling window bucket"""
import numpy as np
list_of_slices=[]
for i in range(0,len(series)-window+1):
list_of_slices.append(series[i:window])
window=window+1
return(np.array(list_of_slices))
def data_frame_flattener(df_data):
df=df_data.copy()
try:
df.columns=[' '.join(map(str,col)).strip() for col in df.columns.values]
except:
pass
return(df)
def column_suffix_adder(df_data,
list_of_columns_to_add_suffix_on,
suffix):
"""Add specific siffix to specific columns"""
df=df_data.copy()
### Add suffix or prefix to certain columns rename all columns
new_names = [(i,i+suffix) for i in df[list_of_columns_to_add_suffix_on].columns.values]
df.rename(columns = dict(new_names), inplace=True)
return(df)
def column_prefix_adder(df_data,
list_of_columns_to_add_prefix_on,
prefix):
"""Add specific prefix to specific columns"""
df=df_data.copy()
### Add prefix prefix to certain columns rename all columns
new_names = [(i,prefix+i) for i in df[list_of_columns_to_add_prefix_on].columns.values]
df.rename(columns = dict(new_names), inplace=True)
return(df)
def pivot_table_with_subgroup(df_data,
pivot_index,
pivot_columns,
pivot_values,
pivot_func):
"""Generate Pivot tabel with subggroup of the index same as what excel does"""
import pandas as pd
df_table = pd.DataFrame()
for i in range(len(pivot_index)):
df_pivot1=pd.pivot_table(data=df_data,
index=pivot_index[:-i+len(pivot_index)],
columns=pivot_columns,
values=pivot_values,
aggfunc=pivot_func,
fill_value=0).reset_index()
df_pivot1= data_frame_flattener(df_pivot1)
### Append Generated dataframes on top of each other
df_table=df_table.append(df_pivot1,
sort=False,
ignore_index=True)
### Sort values
df_table.sort_values(pivot_index,
na_position="first",
inplace=True)
return(df_table)
def weighted_avg(group, value_variable, weight_variable):
""" Compute Weighted Average of a variable given weight variable. Can be used on Groupby.
Example: df_data.groupby(["Manager","State","Product"]).apply(wavg, "Unit Price", "Quantity")
https://pbpython.com/weighted-average.html
http://stackoverflow.com/questions/10951341/pandas-dataframe-aggregate-function-using-multiple-columns
In rare instance, we may not have weights, so just return the mean. Customize this if your business case
should return otherwise.
"""
d = group[value_variable]
w = group[weight_variable]
try:
return (d * w).sum() / w.sum()
except ZeroDivisionError:
return d.mean()
def dataframe_col_widths_returner(df):
import numpy as np
measurer = np.vectorize(len)
return ((measurer(df.values.astype(str)).max(axis=0)))
def data_frame_to_excel_savor(list_of_data_frame,
list_of_sheet_name,
path_filename,
tuple_freez_pans=(1,1),
float_format="%.4f"):
"""Saving dateframes into excelDataFrame my be flat."""
import pandas as pd
import numpy as np
data_frame_for_writer= list_of_data_frame
sheets_in_writer= list_of_sheet_name
###save File as Excel
writer = pd.ExcelWriter(path_filename)
for df,sheet in zip(data_frame_for_writer,
sheets_in_writer):
df.to_excel(writer,
sheet,
index=False,
float_format=float_format)
### Assign WorkBook
workbook=writer.book
# Add a header format
header_format = workbook.add_format({'bold': True,
'text_wrap': True,
'size':10,
'valign': 'top',
'fg_color': '#c7e7ff',
'border': 1})
### Apply same format on each sheet being saved
for df,sheet in zip(data_frame_for_writer,
sheets_in_writer):
for col_num, value in enumerate(df.columns.values):
writer.sheets[sheet].write(0, col_num, value, header_format)
writer.sheets[sheet].autofilter(0,0,0,df.shape[1]-1)
writer.sheets[sheet].freeze_panes(tuple_freez_pans[0],tuple_freez_pans[1])
### If the width of the column is less than 4 , assign 7
widths=dataframe_col_widths_returner(df)
widths=np.where(widths > 5,widths,6.8)
for i, width in enumerate(widths):
writer.sheets[sheet].set_column(i, i, width)
writer.save()
def multivariate_t_rvs(m, S, df, n=1):
#written by Enzo Michelangeli, style changes by josef-pktd
# Student's T random variable
'''generate random variables of multivariate t distribution
Parameters
----------
m : array_like
mean of random variable, length determines dimension of random variable
S : array_like
square array of covariance matrix
df : int or float
degrees of freedom
n : int
number of observations, return random array will be (n, len(m))
Returns
-------
rvs : ndarray, (n, len(m))
each row is an independent draw of a multivariate t distributed
random variable
'''
import numpy as np
m = np.asarray(m)
d = len(m)
if df == np.inf:
x = 1.
else:
x = np.random.chisquare(df, n)/df
z = np.random.multivariate_normal(np.zeros(d),S,(n,))
return m + z/np.sqrt(x)[:,None] # same output format as random.multivariate_normal