Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@

### Vim
# swap
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
# session
Session.vim
# temporary
.netrwhist
*~
# auto-generated tag files
tags

### Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
6 changes: 3 additions & 3 deletions disparate_impact/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ We can see that the classifier sacrificed some accuracy to achieve similar fract
The figure shows the original decision boundary (without any constraints) and the shifted decision boundary that was learnt by the fair classifier. Notice how the boundary shifts to push more non-protected points to the negative class (and vice-versa).


###1.4. Optimizing fairness subject to accuracy constraints
### 1.4. Optimizing fairness subject to accuracy constraints

Now let's try to **optimize fairness** (that does not necessarily correspond to a 100% p-rule) **subject to a deterministic loss in accuracy**. The details can be found in Section 3.3 of our [paper](http://arxiv.org/pdf/1507.05259.pdf).

Expand All @@ -99,7 +99,7 @@ apply_fairness_constraints = 0 # flag for fairness constraint is set back to0 si
apply_accuracy_constraint = 1 # now, we want to optimize fairness subject to accuracy constraints
sep_constraint = 0
gamma = 0.5
w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
```

The "gamma" variable controls how much loss in accuracy we are willing to take while optimizing for fairness. A larger value of gamma will result in a fairer system, but at the cost of a greater loss in accuracy.
Expand Down Expand Up @@ -166,7 +166,7 @@ The following output is generated:

We can see that decreasing the covariance threshold value gives a continuous trade-off between fairness and accuracy. Specifically, we see that the fractions of protected and non-protected examples in the positive class start to converge (resulting in a greater p-rule); however, we get an increasing drop in accuracy.

###1.7. Adult data
### 1.7. Adult data

We also provide a demo of our code on the [Adult dataset](http://archive.ics.uci.edu/ml/datasets/Adult). To apply the fairness constraints to the Adult dataset, execute the following commands:

Expand Down
16 changes: 8 additions & 8 deletions disparate_impact/synthetic_data_demo/decision_boundary_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def test_synthetic_data():

""" Generate the synthetic data """
X, y, x_control = generate_synthetic_data(plot_data=True) # set plot_data to False to skip the data plot
ut.compute_p_rule(x_control["s1"], y) # compute the p-rule in the original data
Expand Down Expand Up @@ -37,7 +37,7 @@ def train_test_classifier():
all_class_labels_assigned_test = np.sign(distances_boundary_test)
correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test, x_control_test, sensitive_attrs)
cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test, x_control_test, sensitive_attrs)
p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])
p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])
return w, p_rule, test_score


Expand Down Expand Up @@ -83,7 +83,7 @@ def plot_boundaries(w1, w2, p1, p2, acc1, acc2, fname):
apply_accuracy_constraint = 0
sep_constraint = 0
w_uncons, p_uncons, acc_uncons = train_test_classifier()

""" Now classify such that we optimize for accuracy while achieving perfect fairness """
apply_fairness_constraints = 1 # set this flag to one since we want to optimize accuracy subject to fairness constraints
apply_accuracy_constraint = 0
Expand All @@ -101,11 +101,11 @@ def plot_boundaries(w1, w2, p1, p2, acc1, acc2, fname):
sep_constraint = 0
gamma = 0.5 # gamma controls how much loss in accuracy we are willing to incur to achieve fairness -- increase gamme to allow more loss in accuracy
print "== Classifier with accuracy constraint =="
w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
plot_boundaries(w_uncons, w_a_cons, p_uncons, p_a_cons, acc_uncons, acc_a_cons, "img/a_cons.png")
w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
plot_boundaries(w_uncons, w_a_cons, p_uncons, p_a_cons, acc_uncons, acc_a_cons, "img/a_cons")

"""
Classify such that we optimize for fairness subject to a certain loss in accuracy
"""
Classify such that we optimize for fairness subject to a certain loss in accuracy
In addition, make sure that no points classified as positive by the unconstrained (original) classifier are misclassified.

"""
Expand All @@ -125,4 +125,4 @@ def main():


if __name__ == '__main__':
main()
main()
8 changes: 3 additions & 5 deletions disparate_impact/synthetic_data_demo/fairness_acc_tradeoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
NUM_FOLDS = 10 # we will show 10-fold cross validation accuracy as a performance measure

def test_synthetic_data():

""" Generate the synthetic data """
X, y, x_control = generate_synthetic_data(plot_data=False)
ut.compute_p_rule(x_control["s1"], y) # compute the p-rule in the original data
Expand All @@ -29,19 +29,17 @@ def test_synthetic_data():
""" Now classify such that we achieve perfect fairness """
apply_fairness_constraints = 1
cov_factor = 0
test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error(X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'], [{'s1':cov_factor} for i in range(0,NUM_FOLDS)])
test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error(X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'], [{'s1':cov_factor} for i in range(0,NUM_FOLDS)])
print
print "== Constrained (fair) classifier =="
ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")

""" Now plot a tradeoff between the fairness and accuracy """
ut.plot_cov_thresh_vs_acc_pos_ratio(X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'])



def main():
test_synthetic_data()


if __name__ == '__main__':
main()
main()
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def gen_gaussian(mean_in, cov_in, class_label):
shuffle(perm)
X = X[perm]
y = y[perm]

rotation_mult = np.array([[math.cos(disc_factor), -math.sin(disc_factor)], [math.sin(disc_factor), math.cos(disc_factor)]])
X_aux = np.dot(X, rotation_mult)

Expand All @@ -53,12 +53,12 @@ def gen_gaussian(mean_in, cov_in, class_label):
# probability for each cluster that the point belongs to it
p1 = nv1.pdf(x)
p2 = nv2.pdf(x)

# normalize the probabilities from 0 to 1
s = p1+p2
p1 = p1/s
p2 = p2/s

r = np.random.uniform() # generate a random number from 0 to 1

if r < p1: # the first cluster is the positive class
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@


def test_synthetic_data():

""" Generate the synthetic data """
data_type = 1
X, y, x_control = generate_synthetic_data(data_type=data_type, plot_data=True) # set plot_data to False to skip the data plot
Expand All @@ -28,10 +28,10 @@ def train_test_classifier():

train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(w, x_train, y_train, x_control_train, x_test, y_test, x_control_test, sensitive_attrs)


# accuracy and FPR are for the test because we need them for plotting
return w, test_score, s_attr_to_fp_fn_test


""" Classify the data while optimizing for accuracy """
print
Expand All @@ -40,7 +40,7 @@ def train_test_classifier():
print "\n-----------------------------------------------------------------------------------\n"

""" Now classify such that we optimize for accuracy while achieving perfect fairness """

print
print "== Classifier with fairness constraint =="

Expand All @@ -50,9 +50,9 @@ def train_test_classifier():
tau = 5.0
mu = 1.2
sensitive_attrs_to_cov_thresh = {"s1": {0:{0:0, 1:0}, 1:{0:0, 1:0}, 2:{0:0, 1:0}}} # zero covariance threshold, means try to get the fairest solution
cons_params = {"cons_type": cons_type,
"tau": tau,
"mu": mu,
cons_params = {"cons_type": cons_type,
"tau": tau,
"mu": mu,
"sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

w_cons, acc_cons, s_attr_to_fp_fn_test_cons = train_test_classifier()
Expand Down Expand Up @@ -84,4 +84,4 @@ def main():


if __name__ == '__main__':
main()
main()
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import matplotlib.pyplot as plt # for plotting stuff

def test_synthetic_data():

""" Generate the synthetic data """
data_type = 1
X, y, x_control = generate_synthetic_data(data_type=data_type, plot_data=False) # set plot_data to False to skip the data plot
Expand All @@ -29,11 +29,11 @@ def train_test_classifier():

train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(w, x_train, y_train, x_control_train, x_test, y_test, x_control_test, sensitive_attrs)


# accuracy and FPR are for the test because we need them for plotting
# the covariance is for train, because we need it for setting the thresholds
return w, test_score, s_attr_to_fp_fn_test, cov_all_train


""" Classify the data while optimizing for accuracy """
print
Expand All @@ -42,7 +42,7 @@ def train_test_classifier():
print "\n-----------------------------------------------------------------------------------\n"

""" Now classify such that we optimize for accuracy while achieving perfect fairness """

print
print "== Classifier with fairness constraint =="

Expand All @@ -66,13 +66,13 @@ def train_test_classifier():
sensitive_attrs_to_cov_thresh[s_attr][cov_type][s_val] *= m


cons_params = {"cons_type": cons_type,
"tau": tau,
"mu": mu,
cons_params = {"cons_type": cons_type,
"tau": tau,
"mu": mu,
"sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

w_cons, acc_cons, s_attr_to_fp_fn_test_cons, cov_all_train_cons = train_test_classifier()

fpr_per_group[0].append(s_attr_to_fp_fn_test_cons["s1"][0.0]["fpr"])
fpr_per_group[1].append(s_attr_to_fp_fn_test_cons["s1"][1.0]["fpr"])
fnr_per_group[0].append(s_attr_to_fp_fn_test_cons["s1"][0.0]["fnr"])
Expand Down Expand Up @@ -111,4 +111,4 @@ def main():


if __name__ == '__main__':
main()
main()
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
SEED = 1122334455
seed(SEED) # set the random seed so that the random permutations can be reproduced again
np.random.seed(SEED)
sys.path.insert(0, '../../fair_classification/')
sys.path.insert(0, '../../fair_classification/')
import utils as ut


Expand Down Expand Up @@ -100,7 +100,7 @@ def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):
y = y[perm]
x_control = x_control[perm]


""" Plot the data """
if plot_data:
plt.figure()
Expand Down Expand Up @@ -131,7 +131,7 @@ def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):

x_control = {"s1": x_control} # all the sensitive features are stored in a dictionary
X = ut.add_intercept(X)


return X,y,x_control

Loading