diff --git a/5 - synthetic-data-applications/fairness/Fairness.ipynb b/5 - synthetic-data-applications/fairness/Fairness.ipynb new file mode 100644 index 0000000..b8957e2 --- /dev/null +++ b/5 - synthetic-data-applications/fairness/Fairness.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "diverse-creek", + "metadata": {}, + "outputs": [], + "source": [ + "# import generic packages\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.linear_model import SGDClassifier \n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "import seaborn as sns\n", + "from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score\n", + "import matplotlib.pyplot as plt\n", + "import fairlens as fl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hollywood-system", + "metadata": {}, + "outputs": [], + "source": [ + "# Import ydata modules\n", + "from ydata.synthesizers.regular.model import BaseModel, RegularSynthesizer\n", + "from ydata.metadata import Metadata\n", + "from ydata.dataset.dataset import Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "intensive-bunch", + "metadata": {}, + "outputs": [], + "source": [ + "# load loans data (available at kaggle https://www.kaggle.com/code/ajaymanwani/loan-approval-prediction/data)\n", + "data = pd.read_csv('~/Downloads/loans.csv').fillna(0)\n", + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sunset-province", + "metadata": {}, + "outputs": [], + "source": [ + "# check that the data is unbalanced for Married status\n", + "sns.countplot(x='Married',data=data, palette = 'Set2') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "great-effects", + 
"metadata": {}, + "outputs": [], + "source": [ + "# convert categorical to numerical\n", + "from sklearn.preprocessing import LabelEncoder\n", + "for c in data.columns:\n", + " if data[c].dtype =='O':\n", + " le = LabelEncoder()\n", + " data[c] = le.fit_transform(data[c].astype(str)).astype('int')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "steady-petersburg", + "metadata": {}, + "outputs": [], + "source": [ + "# train a synthesizer\n", + "synthetizer = RegularSynthesizer()\n", + "original = Dataset(data)\n", + "metadata = Metadata()\n", + "synthetizer.fit(original)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "consistent-nursery", + "metadata": {}, + "outputs": [], + "source": [ + "#generate new data\n", + "synth = synthetizer.sample(n_samples=900)\n", + "s = synth.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "featured-seventh", + "metadata": {}, + "outputs": [], + "source": [ + "#create a balanced dataset\n", + "data_balanced = pd.concat([data,s[s.Married==1]],axis=0)\n", + "data_balanced.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "stock-schema", + "metadata": {}, + "outputs": [], + "source": [ + "# check that new data is balanced\n", + "sns.countplot(x='Married',data=data_balanced, palette = 'Set2') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "economic-lunch", + "metadata": {}, + "outputs": [], + "source": [ + "# create training and test data\n", + "xtrainf = []\n", + "ytrainf = []\n", + "xtrain, xtest, ytrain, ytest = train_test_split(data.drop(columns='Loan_Status'), data['Loan_Status'], test_size=.2, random_state = 100)\n", + "xtrainf.append(xtrain)\n", + "ytrainf.append(ytrain)\n", + "\n", + "xtrain, _, ytrain, _ = train_test_split(data_balanced.drop(columns='Loan_Status'), data_balanced['Loan_Status'], test_size=.2, random_state = 100)\n", + "ii = [i for i in xtrain.index if i not in xtest.index]\n", + 
"xtrainf.append(xtrain.loc[ii])\n", + "ytrainf.append(ytrain.loc[ii])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "portable-courage", + "metadata": {}, + "outputs": [], + "source": [ + "# check accuracy of models trained in both data sets\n", + "models = [RandomForestClassifier(),DecisionTreeClassifier(), AdaBoostClassifier()]\n", + "married = [1,2]\n", + "improvement = {}\n", + "\n", + "for m in models:\n", + " improvement[m] = {}\n", + " for i in range(0,2):\n", + " m.fit(xtrainf[i],ytrainf[i])\n", + " p = m.predict(xtest)\n", + "\n", + " improvement[m][i] = np.round(accuracy_score(p,ytest),2)\n", + " print('Overall Accuracy %f'% (improvement[m][i]))\n", + "\n", + " for g in married:\n", + " p = m.predict(xtest[xtest.Married==g])\n", + " print('model %s accuracy, married_status %s is_augmented %i = %f'% (m, g , i , \n", + " np.round(accuracy_score(p,ytest[xtest.Married==g]),2)))\n", + " print ('Relative improvement ', np.round(100*(improvement[m][1]/improvement[m][0] - 1),2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fabulous-wells", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "impossible-kitty", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "talented-anxiety", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/5 - synthetic-data-applications/regular-tabular/fairness/Fairness.ipynb b/5 - 
synthetic-data-applications/regular-tabular/fairness/Fairness.ipynb new file mode 100644 index 0000000..b8957e2 --- /dev/null +++ b/5 - synthetic-data-applications/regular-tabular/fairness/Fairness.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "diverse-creek", + "metadata": {}, + "outputs": [], + "source": [ + "# import generic packages\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.linear_model import SGDClassifier \n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import LabelEncoder\n", + "import seaborn as sns\n", + "from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score\n", + "import matplotlib.pyplot as plt\n", + "import fairlens as fl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hollywood-system", + "metadata": {}, + "outputs": [], + "source": [ + "# Import ydata modules\n", + "from ydata.synthesizers.regular.model import BaseModel, RegularSynthesizer\n", + "from ydata.metadata import Metadata\n", + "from ydata.dataset.dataset import Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "intensive-bunch", + "metadata": {}, + "outputs": [], + "source": [ + "# load loans data (available at kaggle https://www.kaggle.com/code/ajaymanwani/loan-approval-prediction/data)\n", + "data = pd.read_csv('~/Downloads/loans.csv').fillna(0)\n", + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "sunset-province", + "metadata": {}, + "outputs": [], + "source": [ + "# check that the data is unbalanced for Married status\n", + "sns.countplot(x='Married',data=data, palette = 'Set2') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "great-effects", + "metadata": {}, + "outputs": [], + "source": [ 
+ "# convert categorical to numerical\n", + "from sklearn.preprocessing import LabelEncoder\n", + "for c in data.columns:\n", + " if data[c].dtype =='O':\n", + " le = LabelEncoder()\n", + " data[c] = le.fit_transform(data[c].astype(str)).astype('int')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "steady-petersburg", + "metadata": {}, + "outputs": [], + "source": [ + "# train a synthesizer\n", + "synthetizer = RegularSynthesizer()\n", + "original = Dataset(data)\n", + "metadata = Metadata()\n", + "synthetizer.fit(original)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "consistent-nursery", + "metadata": {}, + "outputs": [], + "source": [ + "#generate new data\n", + "synth = synthetizer.sample(n_samples=900)\n", + "s = synth.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "featured-seventh", + "metadata": {}, + "outputs": [], + "source": [ + "#create a balanced dataset\n", + "data_balanced = pd.concat([data,s[s.Married==1]],axis=0)\n", + "data_balanced.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "stock-schema", + "metadata": {}, + "outputs": [], + "source": [ + "# check that new data is balanced\n", + "sns.countplot(x='Married',data=data_balanced, palette = 'Set2') " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "economic-lunch", + "metadata": {}, + "outputs": [], + "source": [ + "# create training and test data\n", + "xtrainf = []\n", + "ytrainf = []\n", + "xtrain, xtest, ytrain, ytest = train_test_split(data.drop(columns='Loan_Status'), data['Loan_Status'], test_size=.2, random_state = 100)\n", + "xtrainf.append(xtrain)\n", + "ytrainf.append(ytrain)\n", + "\n", + "xtrain, _, ytrain, _ = train_test_split(data_balanced.drop(columns='Loan_Status'), data_balanced['Loan_Status'], test_size=.2, random_state = 100)\n", + "ii = [i for i in xtrain.index if i not in xtest.index]\n", + "xtrainf.append(xtrain.loc[ii])\n", + 
"ytrainf.append(ytrain.loc[ii])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "portable-courage", + "metadata": {}, + "outputs": [], + "source": [ + "# check accuracy of models trained in both data sets\n", + "models = [RandomForestClassifier(),DecisionTreeClassifier(), AdaBoostClassifier()]\n", + "married = [1,2]\n", + "improvement = {}\n", + "\n", + "for m in models:\n", + " improvement[m] = {}\n", + " for i in range(0,2):\n", + " m.fit(xtrainf[i],ytrainf[i])\n", + " p = m.predict(xtest)\n", + "\n", + " improvement[m][i] = np.round(accuracy_score(p,ytest),2)\n", + " print('Overall Accuracy %f'% (improvement[m][i]))\n", + "\n", + " for g in married:\n", + " p = m.predict(xtest[xtest.Married==g])\n", + " print('model %s accuracy, married_status %s is_augmented %i = %f'% (m, g , i , \n", + " np.round(accuracy_score(p,ytest[xtest.Married==g]),2)))\n", + " print ('Relative improvement ', np.round(100*(improvement[m][1]/improvement[m][0] - 1),2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fabulous-wells", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "impossible-kitty", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "talented-anxiety", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}