-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclass_missing_values.py
More file actions
83 lines (52 loc) · 2.72 KB
/
class_missing_values.py
File metadata and controls
83 lines (52 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# ==============================
# MCAR ad hoc tests vs MNAR, MAR
# ==============================
# ======
# Plots
# ======
#import ED
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import logging
from pd_download import data_cleaning
# --------------------------------------------------------Class Imputation----------------------------------------------------
class ImputationCat:
    """Impute and encode the categorical columns of a data set.

    The instance wraps one DataFrame (``df_cat``). The imputation methods work
    on copies of it and return new frames; ``_ordinal_encode_nan`` is the only
    method that modifies a frame in place, and that frame is passed explicitly.
    """

    def __init__(self, df_cat: pd.DataFrame) -> None:
        """Store the frame whose missing values are to be imputed."""
        self.df_cat = df_cat

    def simple_imputer_mode(self) -> pd.DataFrame:
        """Return a copy of ``df_cat`` with gaps filled by each column's mode.

        Uses scikit-learn's ``SimpleImputer(strategy="most_frequent")``.

        Returns:
            A deep copy of ``df_cat`` carrying the imputed values; ``df_cat``
            itself is left untouched.
        """
        df_cat_mode = self.df_cat.copy(deep=True)
        mode_imputer = SimpleImputer(strategy="most_frequent")
        # Assign through iloc so the copy keeps its own index and column labels
        # (fit_transform returns a bare array).
        df_cat_mode.iloc[:, :] = mode_imputer.fit_transform(df_cat_mode)
        return df_cat_mode

    def KNN_Imputation(self, n_neighbors: int = 5) -> pd.DataFrame:
        """Return ``df_cat`` with missing values filled by k-nearest neighbours.

        ``df_cat`` is cast to float first, so it must already be numeric
        (for example after ordinal encoding).

        Args:
            n_neighbors: Number of neighbours used for each imputed value.

        Returns:
            A new DataFrame built from the imputed array; it carries default
            integer row and column labels, not those of ``df_cat``.
        """
        # Function-scope import: the module header only pulls SimpleImputer
        # out of sklearn.impute.
        from sklearn.impute import KNNImputer

        # Read the frame from the instance; a bare ``df_cat`` is not defined
        # inside this method.
        dataframe_array = self.df_cat.to_numpy().astype(float)
        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        dataframe_impute_KNN = knn_imputer.fit_transform(dataframe_array)
        return pd.DataFrame(dataframe_impute_KNN)

    def _ordinal_encode_nan(
        self, independent_series: pd.Series, dataframe: pd.DataFrame
    ) -> pd.DataFrame:
        """Ordinal-encode one column while leaving its missing values missing.

        Only the non-null entries of ``independent_series`` are encoded; the
        codes are written into the column of ``dataframe`` that has the same
        name as the series.

        Args:
            independent_series: The column to encode (may contain nulls).
            dataframe: Frame that receives the codes. It is modified in place.

        Returns:
            The same ``dataframe`` object that was passed in.
        """
        not_null_mask = independent_series.notnull()
        if not not_null_mask.any():
            # Nothing to encode; the encoder cannot be fitted on zero samples.
            return dataframe

        encoder = OrdinalEncoder()
        # The encoder expects a 2-D array, hence the single-column reshape.
        reshaped_vals = independent_series[not_null_mask].values.reshape(-1, 1)
        encoded_vals = encoder.fit_transform(reshaped_vals)
        dataframe.loc[not_null_mask, independent_series.name] = np.squeeze(encoded_vals)
        return dataframe

    def concatenate_total_df(
        self, dataframefloat: pd.DataFrame, dataframecategorical: pd.DataFrame
    ) -> pd.DataFrame:
        """Concatenate the imputed float and categorical frames column-wise.

        Args:
            dataframefloat: Frame placed first (left) in the result.
            dataframecategorical: Frame placed second (right) in the result.

        Returns:
            One frame holding the columns of both inputs, for further analysis.
        """
        df_total_no_missing = pd.concat([dataframefloat, dataframecategorical], axis=1)
        return df_total_no_missing
# -----------------------------------------------------------Testing-----------------------------------------------------------
if __name__ == "__main__":
    # Manual run: clean the KGB file, then mode-impute its categorical frame.
    cleaned = data_cleaning("./KGB.sas7bdat")
    data_types, df_loan_categorical, df_loan_float = cleaned

    imputer = ImputationCat(df_loan_categorical)
    # The float and categorical frames can later be joined with
    # imputer.concatenate_total_df(...).
    imputed_categorical = imputer.simple_imputer_mode()