commit-live-students · Tusharsharma86 · Oct 21, 2018 · Oct 21, 2018 · Oct 21, 2018 · Oct 21, 2018
diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc b/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_outlier_removal/__pycache__/build.cpython-36.pyc b/q01_outlier_removal/__pycache__/build.cpython-36.pyc
diff --git a/q01_outlier_removal/build.py b/q01_outlier_removal/build.py
@@ -1,8 +1,28 @@
+# %load q01_outlier_removal/build.py
 # Default imports
 import pandas as pd
 
 loan_data = pd.read_csv('data/loan_prediction_uncleaned.csv')
 loan_data = loan_data.drop('Loan_ID', 1)
 
+# Drop every row that exceeds a 95th-percentile cap on income or loan size
+def outlier_removal(loan_data):
+    """Return loan_data minus rows above the 0.95 quantile of any capped column."""
+    capped = ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount')
+    # Caps come from the frame as passed in, before any row is dropped
+    limits = [(name, loan_data[name].quantile(0.95)) for name in capped]
+    for name, limit in limits:
+        loan_data = loan_data.drop(loan_data[loan_data[name] > limit].index)
+    return loan_data
+
+outlier_removal(loan_data).shape  # bare expression: the resulting shape is computed but discarded
+
+
+
+
+
+
+
+
+
 
-# Write your Solution here:
diff --git a/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc
diff --git a/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc
diff --git a/q02_data_cleaning_all/build.py b/q02_data_cleaning_all/build.py
@@ -1,9 +1,11 @@
+# %load q02_data_cleaning_all/build.py
 # Default Imports
 import sys, os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname('__file__'))))
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import Imputer
 from greyatomlib.logistic_regression_project.q01_outlier_removal.build import outlier_removal
 
 loan_data = pd.read_csv('data/loan_prediction_uncleaned.csv')
@@ -12,3 +14,24 @@
 
 
 # Write your solution here :
+def data_cleaning(loan_data):
+    """Impute missing values in loan_data, then return X, y and a 75/25 train/test split."""
+    # LoanAmount is treated as numeric: fill its gaps with the column mean
+    loan_data['LoanAmount'] = loan_data['LoanAmount'].fillna(loan_data['LoanAmount'].mean())
+    cat_col = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']
+    for col in cat_col:
+        # Fill each column itself; mode() returns a Series, so take its first entry as a scalar
+        loan_data[col] = loan_data[col].fillna(loan_data[col].mode()[0])
+    # Separate the features from the target (assumed to be the last column)
+    X = loan_data.iloc[:, :-1]
+    y = loan_data.iloc[:, -1]
+    # Fixed random_state keeps the train/test split reproducible
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9)
+    return X, y, X_train, X_test, y_train, y_test
+
+
+
+
+
+
+
diff --git a/q02_data_cleaning_all/tests/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all/tests/__pycache__/test_q02_data_cleaning.cpython-36.pyc b/q02_data_cleaning_all/tests/__pycache__/test_q02_data_cleaning.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all_2/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/__pycache__/build.cpython-36.pyc b/q02_data_cleaning_all_2/__pycache__/build.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/build.py b/q02_data_cleaning_all_2/build.py
@@ -1,3 +1,4 @@
+# %load q02_data_cleaning_all_2/build.py
 # Default Imports
 import pandas as pd
 import numpy as np
@@ -11,3 +12,38 @@
 
 
 # Write your solution here :
+def data_cleaning_2(X_train, X_test, y_train, y_test):
+    """Sqrt-transform the numeric columns and one-hot encode categoricals consistently."""
+    # Copy so the caller's frames (often slices of a larger frame) are not mutated
+    X_train, X_test = X_train.copy(), X_test.copy()
+    cat_col = list(X_train.select_dtypes(include=['object']).columns)
+    num_col = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
+
+    for col in num_col:
+        X_train[col + '_sqrt'] = np.sqrt(X_train[col])
+        X_test[col + '_sqrt'] = np.sqrt(X_test[col])
+
+    # Encode the test set with the train categories so both splits get the same
+    # dummy columns even when a category is missing from one of them
+    for col in cat_col:
+        levels = pd.Categorical(X_train[col]).categories
+        X_train[col] = pd.Categorical(X_train[col], categories=levels)
+        X_test[col] = pd.Categorical(X_test[col], categories=levels)
+
+    X_train = pd.concat([X_train, pd.get_dummies(X_train[cat_col], drop_first=True)], axis=1)
+    X_test = pd.concat([X_test, pd.get_dummies(X_test[cat_col], drop_first=True)], axis=1)
+    drop_col = cat_col + num_col
+    return X_train.drop(labels=drop_col, axis=1), X_test.drop(labels=drop_col, axis=1), y_train, y_test
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/q02_data_cleaning_all_2/tests/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all_2/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q02_data_cleaning_all_2/tests/__pycache__/q02_test_data_cleaning_2.cpython-36.pyc b/q02_data_cleaning_all_2/tests/__pycache__/q02_test_data_cleaning_2.cpython-36.pyc
diff --git a/q03_logistic_regression/__pycache__/__init__.cpython-36.pyc b/q03_logistic_regression/__pycache__/__init__.cpython-36.pyc
diff --git a/q03_logistic_regression/__pycache__/build.cpython-36.pyc b/q03_logistic_regression/__pycache__/build.cpython-36.pyc
diff --git a/q03_logistic_regression/build.py b/q03_logistic_regression/build.py
@@ -1,12 +1,16 @@
+# %load q03_logistic_regression/build.py
 # Default Imports
 import pandas as pd
 from sklearn.preprocessing import StandardScaler
 from sklearn.linear_model import LogisticRegression
+import matplotlib.pyplot as plt
 from sklearn.metrics import confusion_matrix
 from greyatomlib.logistic_regression_project.q01_outlier_removal.build import outlier_removal
 from greyatomlib.logistic_regression_project.q02_data_cleaning_all.build import data_cleaning
 from greyatomlib.logistic_regression_project.q02_data_cleaning_all_2.build import data_cleaning_2
 
+
+
 loan_data = pd.read_csv('data/loan_prediction_uncleaned.csv')
 loan_data = loan_data.drop('Loan_ID', 1)
 loan_data = outlier_removal(loan_data)
@@ -15,4 +19,35 @@
 
 
 # Write your solution code here:
+def logistic_regression(X_train, X_test, y_train, y_test):
+    """Scale the numeric columns, fit a logistic regression and score it.
+
+    The scaler is fitted on X_train only and reused for X_test, so both splits
+    share one scale and no test-set statistics enter the preprocessing.
+
+    Returns the confusion matrix of y_test against the test-set predictions.
+    """
+    num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
+    # Work on copies so the caller's frames keep their unscaled values
+    X_train, X_test = X_train.copy(), X_test.copy()
+
+    std_scl = StandardScaler()
+    X_train[num_cols] = std_scl.fit_transform(X_train[num_cols])
+    # transform, not fit_transform: apply the mean/std learned from the train split
+    X_test[num_cols] = std_scl.transform(X_test[num_cols])
+
+    # Fixed random_state keeps the fitted model reproducible
+    model = LogisticRegression(random_state=9)
+    model.fit(X_train, y_train)
+    y_pred = model.predict(X_test)
+
+    cm = confusion_matrix(y_test, y_pred)
+    return cm
+
+
+
+
+
+
+
 
diff --git a/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc
diff --git a/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc