streamlit/class_missing_values.py at main · Humbulani1234/streamlit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83


# ==============================
# MCAR adhoc tests vs MNAR, MAR
# ==============================

# ======
# Plots
# ======

#import ED
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
import logging

from pd_download import data_cleaning

# --------------------------------------------------------Class Imputation----------------------------------------------------

class ImputationCat:
    """Imputation helpers for a DataFrame of categorical features.

    The instance keeps a reference to the frame passed to the constructor
    (``self.df_cat``); the imputation methods read from it and return new
    frames.
    """

    def __init__(self, df_cat):
        """Store the categorical DataFrame the imputation methods operate on."""

        self.df_cat = df_cat

    def simple_imputer_mode(self):
        """Return a copy of ``self.df_cat`` with missing values mode-imputed.

        Missing entries are filled with scikit-learn's ``SimpleImputer`` using
        the ``"most_frequent"`` strategy. The work is done on a deep copy, so
        the frame stored on the instance is left untouched.
        """

        df_cat_mode = self.df_cat.copy(True)
        mode_imputer = SimpleImputer(strategy="most_frequent")
        # NOTE(review): SimpleImputer may drop columns that are entirely
        # missing, in which case this assignment would fail on a shape
        # mismatch -- confirm the input never contains such columns.
        df_cat_mode.iloc[:, :] = mode_imputer.fit_transform(df_cat_mode)

        return df_cat_mode

    def KNN_Imputation(self):
        """Return a KNN-imputed version of ``self.df_cat`` as a new DataFrame.

        The stored frame is cast to a float array, so every column must
        already be numeric (for example ordinal-encoded). The result is built
        directly from the imputed array and therefore carries default integer
        row and column labels rather than those of ``self.df_cat``.
        """

        # Imported locally: the file's top-level imports do not provide a KNN
        # imputer (the previously referenced ``impy`` name was never imported).
        from sklearn.impute import KNNImputer

        # Read the frame held by the instance; a bare ``df_cat`` only resolves
        # when a module-level variable of that name happens to exist.
        dataframe_array = self.df_cat.to_numpy().astype(float)
        dataframe_impute_KNN = KNNImputer().fit_transform(dataframe_array)

        return pd.DataFrame(dataframe_impute_KNN)

    def _ordinal_encode_nan(self, independent_series, dataframe):  # for one column, then procedural
        """Ordinal-encode the non-missing values of a single column.

        Parameters
        ----------
        independent_series : pandas.Series
            Column to encode. Its ``name`` selects the column of ``dataframe``
            that receives the encoded values.
        dataframe : pandas.DataFrame
            Frame that is modified in place: rows where ``independent_series``
            is not null get the encoded value, all other rows are left as
            they are.

        Returns
        -------
        pandas.DataFrame
            The same ``dataframe`` object that was passed in.
        """

        observed = independent_series.notnull()
        if not observed.any():
            # Nothing to encode; fitting the encoder on an empty array would
            # raise, so hand the frame back unchanged.
            return dataframe

        encoder = OrdinalEncoder()
        # The encoder expects a 2-D array, hence the single-column reshape.
        observed_values = independent_series[observed].values.reshape(-1, 1)
        encoded_vals = encoder.fit_transform(observed_values)

        # Flatten back to 1-D so the values line up with the selected rows.
        dataframe.loc[observed, independent_series.name] = encoded_vals.ravel()

        return dataframe

    def concatenate_total_df(self, dataframefloat, dataframecategorical):
        """Concatenate the imputed float and categorical frames column-wise.

        Returns one DataFrame with the columns of ``dataframefloat`` followed
        by those of ``dataframecategorical`` (``pd.concat`` with ``axis=1``).
        """

        df_total_no_missing = pd.concat([dataframefloat, dataframecategorical], axis=1)

        return df_total_no_missing

# -----------------------------------------------------------Testing-----------------------------------------------------------

if __name__ == "__main__":

    # Manual smoke run: data_cleaning (from pd_download) is unpacked into
    # three values, of which only the categorical frame is used here.
    cleaned = data_cleaning("./KGB.sas7bdat")
    _data_types, categorical_frame, _float_frame = cleaned

    imputer = ImputationCat(categorical_frame)
    imputed_categorical = imputer.simple_imputer_mode()
    # print(imputed_categorical)  # uncomment to inspect the imputed frame