Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 218 additions & 0 deletions 5 - synthetic-data-applications/fairness/Fairness.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "diverse-creek",
"metadata": {},
"outputs": [],
"source": [
"# import generic packages\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.linear_model import SGDClassifier \n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import seaborn as sns\n",
"from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score\n",
"import matplotlib.pyplot as plt\n",
"import fairlens as fl"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "hollywood-system",
"metadata": {},
"outputs": [],
"source": [
"# Import ydata modules\n",
"from ydata.synthesizers.regular.model import BaseModel, RegularSynthesizer\n",
"from ydata.metadata import Metadata\n",
"from ydata.dataset.dataset import Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "intensive-bunch",
"metadata": {},
"outputs": [],
"source": [
"# Load the loans data (available on Kaggle:\n",
"# https://www.kaggle.com/code/ajaymanwani/loan-approval-prediction/data).\n",
"# Point LOANS_CSV at your local copy of the file; the default is the location used originally.\n",
"LOANS_CSV = '~/Downloads/loans.csv'\n",
"\n",
"# Missing values are replaced with 0 in every column, text columns included, so a missing\n",
"# category turns into its own '0' label when the object columns are cast to str and encoded.\n",
"data = pd.read_csv(LOANS_CSV).fillna(0)\n",
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sunset-province",
"metadata": {},
"outputs": [],
"source": [
"# Check that the data is unbalanced with respect to the Married column.\n",
"sns.countplot(data=data, x='Married', palette='Set2')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "great-effects",
"metadata": {},
"outputs": [],
"source": [
"# Convert categorical (object-dtype) columns to integer codes, in place.\n",
"# LabelEncoder is already imported in the first cell, so it is not imported again here.\n",
"# Values are cast to str first: after fillna(0) a column with missing entries holds both\n",
"# strings and the number 0, and the encoder needs values of a single type to order its labels.\n",
"# NOTE(review): every object column is encoded, including any identifier-like column - confirm this is intended.\n",
"for c in data.columns:\n",
"    if data[c].dtype == 'O':\n",
"        data[c] = LabelEncoder().fit_transform(data[c].astype(str)).astype('int')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "steady-petersburg",
"metadata": {},
"outputs": [],
"source": [
"# Train a synthesizer on the encoded loans data.\n",
"original = Dataset(data)\n",
"synthetizer = RegularSynthesizer()\n",
"# NOTE(review): this Metadata object is created but never passed to fit() - confirm whether it is needed.\n",
"metadata = Metadata()\n",
"synthetizer.fit(original)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "consistent-nursery",
"metadata": {},
"outputs": [],
"source": [
"# Generate new (synthetic) records with the fitted synthesizer and convert them to pandas.\n",
"N_SYNTHETIC_SAMPLES = 900\n",
"synth = synthetizer.sample(n_samples=N_SYNTHETIC_SAMPLES)\n",
"s = synth.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "featured-seventh",
"metadata": {},
"outputs": [],
"source": [
"# Create a balanced dataset by appending the synthetic rows with Married == 1 to the original data.\n",
"# ignore_index=True gives every row of the result its own index label. Without it the synthetic\n",
"# rows keep the labels they had in `s`, which can coincide with labels of original rows, and the\n",
"# later index-based removal of test rows would then drop or duplicate the wrong records.\n",
"# `data` still has the default 0..n-1 index from read_csv, so the original rows keep their labels.\n",
"data_balanced = pd.concat([data, s[s.Married == 1]], axis=0, ignore_index=True)\n",
"data_balanced.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "stock-schema",
"metadata": {},
"outputs": [],
"source": [
"# Check whether the augmented data is balanced with respect to the Married column.\n",
"sns.countplot(data=data_balanced, x='Married', palette='Set2')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "economic-lunch",
"metadata": {},
"outputs": [],
"source": [
"# Create training and test data.\n",
"# xtrainf / ytrainf hold two training sets: [0] original data only, [1] data augmented with synthetic rows.\n",
"# Both are evaluated on the same test set (xtest, ytest), which is taken from the original data.\n",
"xtrainf = []\n",
"ytrainf = []\n",
"xtrain, xtest, ytrain, ytest = train_test_split(\n",
"    data.drop(columns='Loan_Status'), data['Loan_Status'], test_size=.2, random_state=100)\n",
"xtrainf.append(xtrain)\n",
"ytrainf.append(ytrain)\n",
"\n",
"xtrain, _, ytrain, _ = train_test_split(\n",
"    data_balanced.drop(columns='Loan_Status'), data_balanced['Loan_Status'], test_size=.2, random_state=100)\n",
"# Remove rows whose index label also appears in the test set, so test records do not leak into training.\n",
"# A boolean mask keeps or drops each row exactly once; selecting with .loc and a list of labels\n",
"# would return a row several times whenever its label is repeated in the index.\n",
"# Rows are matched purely on index label, so labels in data_balanced are assumed to identify rows uniquely.\n",
"not_in_test = ~xtrain.index.isin(xtest.index)\n",
"xtrainf.append(xtrain[not_in_test])\n",
"ytrainf.append(ytrain[not_in_test])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "portable-courage",
"metadata": {},
"outputs": [],
"source": [
"# Compare the accuracy of models trained on the original (i = 0) and augmented (i = 1) training sets.\n",
"# The estimators are seeded with the same value as the train/test split so that re-running\n",
"# the cell reproduces the same numbers.\n",
"SEED = 100\n",
"models = [RandomForestClassifier(random_state=SEED),\n",
"          DecisionTreeClassifier(random_state=SEED),\n",
"          AdaBoostClassifier(random_state=SEED)]\n",
"married = [1, 2]  # encoded values of the Married column that are reported separately\n",
"improvement = {}  # improvement[model][i] -> overall test accuracy, rounded to 2 decimals\n",
"\n",
"for m in models:\n",
"    improvement[m] = {}\n",
"    for i in range(2):\n",
"        m.fit(xtrainf[i], ytrainf[i])\n",
"        p = m.predict(xtest)\n",
"\n",
"        # accuracy_score takes (y_true, y_pred)\n",
"        improvement[m][i] = np.round(accuracy_score(ytest, p), 2)\n",
"        print('Overall Accuracy %f'% (improvement[m][i]))\n",
"\n",
"        for g in married:\n",
"            in_group = xtest.Married == g  # test rows belonging to this Married value\n",
"            p = m.predict(xtest[in_group])\n",
"            print('model %s accuracy, married_status %s is_augmented %i = %f'% (m, g, i,\n",
"                  np.round(accuracy_score(ytest[in_group], p), 2)))\n",
"    # Relative change (in %) of overall accuracy when training on the augmented set instead of the original one.\n",
"    print ('Relative improvement ', np.round(100*(improvement[m][1]/improvement[m][0] - 1),2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fabulous-wells",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "impossible-kitty",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "talented-anxiety",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading