diff --git a/__pycache__/__init__.cpython-36.pyc b/__pycache__/__init__.cpython-36.pyc index 2e5da7d..0ed8457 100644 Binary files a/__pycache__/__init__.cpython-36.pyc and b/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc b/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc index 2f9a42a..84ab77f 100644 Binary files a/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc and b/q01_outlier_removal/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_outlier_removal/__pycache__/build.cpython-36.pyc b/q01_outlier_removal/__pycache__/build.cpython-36.pyc index 8248a16..945754f 100644 Binary files a/q01_outlier_removal/__pycache__/build.cpython-36.pyc and b/q01_outlier_removal/__pycache__/build.cpython-36.pyc differ diff --git a/q01_outlier_removal/build.py b/q01_outlier_removal/build.py index ec278ba..e6f0d22 100644 --- a/q01_outlier_removal/build.py +++ b/q01_outlier_removal/build.py @@ -1,3 +1,4 @@ +# %load q01_outlier_removal/build.py # Default imports import pandas as pd @@ -6,3 +7,11 @@ # Write your Solution here: +def outlier_removal(loan_data): + loan = loan_data[['ApplicantIncome','CoapplicantIncome','LoanAmount']] + q = loan_data.quantile(0.95) + for l in loan: + loan_data = loan_data.drop(loan_data[loan_data[l] > q[l]].index) + return loan_data +outlier_removal(loan_data).shape + diff --git a/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc index 5a057ff..c45ab8d 100644 Binary files a/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc and b/q01_outlier_removal/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc b/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc index 4c0b6c7..be1452c 100644 Binary files a/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc and b/q01_outlier_removal/tests/__pycache__/test_q01_outlier_removal.cpython-36.pyc differ diff --git a/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..a281e5e Binary files /dev/null and b/q02_data_cleaning_all/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc b/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..e69274d Binary files /dev/null and b/q02_data_cleaning_all/__pycache__/build.cpython-36.pyc differ diff --git a/q02_data_cleaning_all/build.py b/q02_data_cleaning_all/build.py index b56e2bc..5b96382 100644 --- a/q02_data_cleaning_all/build.py +++ b/q02_data_cleaning_all/build.py @@ -1,14 +1,39 @@ +# %load q02_data_cleaning_all/build.py # Default Imports import sys, os sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname('__file__')))) import pandas as pd import numpy as np from sklearn.model_selection import train_test_split +from sklearn.preprocessing import Imputer from greyatomlib.logistic_regression_project.q01_outlier_removal.build import outlier_removal loan_data = pd.read_csv('data/loan_prediction_uncleaned.csv') loan_data = loan_data.drop('Loan_ID', 1) loan_data = outlier_removal(loan_data) - +np.random.seed(9) # Write your solution here : +def data_cleaning(loan_data): + #Computing mean of the numerical variable + impu_mean = Imputer(missing_values = 'NaN', strategy='mean') + impu_mean.fit(loan_data[['LoanAmount']]) + loan_data[['LoanAmount']] = impu_mean.transform(loan_data[['LoanAmount']]) + + #Computing mode of the categorical variable + cate_col = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History'] + for col in cate_col: + loan_data['LoanAmount'].fillna(loan_data[col].mode(), inplace = True) + + #seperate the features and target variable + X = loan_data.iloc[:,:-1] + y = loan_data.iloc[:,-1] + + #train test split + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 9) + return X, y, X_train, X_test, y_train, y_test +data_cleaning(loan_data) + + + + diff --git a/q02_data_cleaning_all/tests/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..bbe9834 Binary files /dev/null and b/q02_data_cleaning_all/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_data_cleaning_all/tests/__pycache__/test_q02_data_cleaning.cpython-36.pyc b/q02_data_cleaning_all/tests/__pycache__/test_q02_data_cleaning.cpython-36.pyc new file mode 100644 index 0000000..fa680c1 Binary files /dev/null and b/q02_data_cleaning_all/tests/__pycache__/test_q02_data_cleaning.cpython-36.pyc differ diff --git a/q02_data_cleaning_all_2/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all_2/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..b384174 Binary files /dev/null and b/q02_data_cleaning_all_2/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_data_cleaning_all_2/__pycache__/build.cpython-36.pyc b/q02_data_cleaning_all_2/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..3f7a332 Binary files /dev/null and b/q02_data_cleaning_all_2/__pycache__/build.cpython-36.pyc differ diff --git a/q02_data_cleaning_all_2/build.py b/q02_data_cleaning_all_2/build.py index e20ff7b..6b9fd79 100644 --- a/q02_data_cleaning_all_2/build.py +++ b/q02_data_cleaning_all_2/build.py @@ -1,3 +1,4 @@ +# %load q02_data_cleaning_all_2/build.py # Default Imports import pandas as pd import numpy as np @@ -11,3 +12,13 @@ # Write your solution here : +def data_cleaning_2(X_train,X_test,y_train,y_test): + cols=X_train.select_dtypes(include=['object']).columns.tolist() + X_train['ApplicantIncome'] = np.sqrt(X_train['ApplicantIncome']) + X_test['ApplicantIncome']= np.sqrt(X_test['ApplicantIncome']) + X_train=pd.get_dummies(X_train,columns=cols,drop_first=True) + X_test=pd.get_dummies(X_test,columns=cols,drop_first=True) + return X_train,X_test,y_train,y_test +data_cleaning_2(X_train,X_test,y_train,y_test) + + diff --git a/q02_data_cleaning_all_2/tests/__pycache__/__init__.cpython-36.pyc b/q02_data_cleaning_all_2/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..f291273 Binary files /dev/null and b/q02_data_cleaning_all_2/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q02_data_cleaning_all_2/tests/__pycache__/q02_test_data_cleaning_2.cpython-36.pyc b/q02_data_cleaning_all_2/tests/__pycache__/q02_test_data_cleaning_2.cpython-36.pyc new file mode 100644 index 0000000..f0805cb Binary files /dev/null and b/q02_data_cleaning_all_2/tests/__pycache__/q02_test_data_cleaning_2.cpython-36.pyc differ diff --git a/q03_logistic_regression/__pycache__/__init__.cpython-36.pyc b/q03_logistic_regression/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..fec288a Binary files /dev/null and b/q03_logistic_regression/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_logistic_regression/__pycache__/build.cpython-36.pyc b/q03_logistic_regression/__pycache__/build.cpython-36.pyc new file mode 100644 index 0000000..45420de Binary files /dev/null and b/q03_logistic_regression/__pycache__/build.cpython-36.pyc differ diff --git a/q03_logistic_regression/build.py b/q03_logistic_regression/build.py index cdbd506..bdc6f96 100644 --- a/q03_logistic_regression/build.py +++ b/q03_logistic_regression/build.py @@ -1,3 +1,4 @@ +# %load q03_logistic_regression/build.py # Default Imports import pandas as pd from sklearn.preprocessing import StandardScaler @@ -15,4 +16,15 @@ # Write your solution code here: +def logistic_regression(X_train,X_test,y_train,y_test): + scalar=StandardScaler() + scalar.fit(X_train[['ApplicantIncome','CoapplicantIncome','LoanAmount']]) + modl = LogisticRegression(random_state=9) + modl.fit(X_train,y_train) + y_pred=modl.predict(X_test) + cm = confusion_matrix(y_test,y_pred) + return cm +logistic_regression(X_train,X_test,y_train,y_test) + + diff --git a/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..4d959ff Binary files /dev/null and b/q03_logistic_regression/tests/__pycache__/__init__.cpython-36.pyc differ diff --git a/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc new file mode 100644 index 0000000..58d6b77 Binary files /dev/null and b/q03_logistic_regression/tests/__pycache__/test_q03_logistic_regression.cpython-36.pyc differ