mbilalzafar · joclement · Feb 13, 2018 · Feb 13, 2018 · Feb 14, 2018
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,104 @@
+
+### Vim
+# swap
+[._]*.s[a-w][a-z]
+[._]s[a-w][a-z]
+# session
+Session.vim
+# temporary
+.netrwhist
+*~
+# auto-generated tag files
+tags
+
+### Python
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+.venv/
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
diff --git a/disparate_impact/README.md b/disparate_impact/README.md
@@ -90,7 +90,7 @@ We can see that the classifier sacrificed some accuracy to achieve similar fract
 The figure shows the original decision boundary (without any constraints) and the shifted decision boundary that was learnt by the fair classifier. Notice how the boundary shifts to push more non-protected points to the negative class (and vice-versa).
 
 
-###1.4. Optimizing fairness subject to accuracy constraints
+### 1.4. Optimizing fairness subject to accuracy constraints
 
 Now let's try to **optimize fairness** (that does not necessarily correspond to a 100% p-rule) **subject to a deterministic loss in accuracy**. The details can be found in Section 3.3 of our [paper](http://arxiv.org/pdf/1507.05259.pdf).
 
@@ -99,7 +99,7 @@ apply_fairness_constraints = 0 # flag for fairness constraint is set back to0 si
 apply_accuracy_constraint = 1 # now, we want to optimize fairness subject to accuracy constraints
 sep_constraint = 0
 gamma = 0.5
-w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()    
+w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
 ```
 
 The "gamma" variable controls how much loss in accuracy we are willing to take while optimizing for fairness. A larger value of gamma will result in a fairer system, but at the cost of a greater loss in accuracy.
@@ -166,7 +166,7 @@ The following output is generated:
 
 We can see that decreasing the covariance threshold value gives a continuous trade-off between fairness and accuracy. Specifically, we see that the fractions of protected and non-protected examples in the positive class start to converge (resulting in a greater p-rule); however, we get an increasing drop in accuracy.
 
-###1.7. Adult data
+### 1.7. Adult data
 
 We also provide a demo of our code on [Adult dataset](http://archive.ics.uci.edu/ml/datasets/Adult). For applying the fairness constraints on the adult dataset, execute the following commands:
 

diff --git a/disparate_impact/synthetic_data_demo/decision_boundary_demo.py b/disparate_impact/synthetic_data_demo/decision_boundary_demo.py
@@ -8,7 +8,7 @@
 
 
 def test_synthetic_data():
-	
+
 	""" Generate the synthetic data """
 	X, y, x_control = generate_synthetic_data(plot_data=True) # set plot_data to False to skip the data plot
 	ut.compute_p_rule(x_control["s1"], y) # compute the p-rule in the original data
@@ -37,7 +37,7 @@ def train_test_classifier():
 		all_class_labels_assigned_test = np.sign(distances_boundary_test)
 		correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test, x_control_test, sensitive_attrs)
 		cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test, x_control_test, sensitive_attrs)
-		p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])	
+		p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])
 		return w, p_rule, test_score
 
 
@@ -83,7 +83,7 @@ def plot_boundaries(w1, w2, p1, p2, acc1, acc2, fname):
 	apply_accuracy_constraint = 0
 	sep_constraint = 0
 	w_uncons, p_uncons, acc_uncons = train_test_classifier()
-	
+
 	""" Now classify such that we optimize for accuracy while achieving perfect fairness """
 	apply_fairness_constraints = 1 # set this flag to one since we want to optimize accuracy subject to fairness constraints
 	apply_accuracy_constraint = 0
@@ -101,11 +101,11 @@ def plot_boundaries(w1, w2, p1, p2, acc1, acc2, fname):
 	sep_constraint = 0
 	gamma = 0.5 # gamma controls how much loss in accuracy we are willing to incur to achieve fairness -- increase gamme to allow more loss in accuracy
 	print "== Classifier with accuracy constraint =="
-	w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()	
-	plot_boundaries(w_uncons, w_a_cons, p_uncons, p_a_cons, acc_uncons, acc_a_cons, "img/a_cons.png")
+	w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
+	plot_boundaries(w_uncons, w_a_cons, p_uncons, p_a_cons, acc_uncons, acc_a_cons, "img/a_cons")
 
-	""" 
-	Classify such that we optimize for fairness subject to a certain loss in accuracy 
+	"""
+	Classify such that we optimize for fairness subject to a certain loss in accuracy
 	In addition, make sure that no points classified as positive by the unconstrained (original) classifier are misclassified.
 
 	"""
@@ -125,4 +125,4 @@ def main():
 
 
 if __name__ == '__main__':
-	main()
+	main()
diff --git a/disparate_impact/synthetic_data_demo/fairness_acc_tradeoff.py b/disparate_impact/synthetic_data_demo/fairness_acc_tradeoff.py
@@ -8,7 +8,7 @@
 NUM_FOLDS = 10 # we will show 10-fold cross validation accuracy as a performance measure
 
 def test_synthetic_data():
-	
+
 	""" Generate the synthetic data """
 	X, y, x_control = generate_synthetic_data(plot_data=False)
 	ut.compute_p_rule(x_control["s1"], y) # compute the p-rule in the original data
@@ -29,19 +29,17 @@ def test_synthetic_data():
 	""" Now classify such that we achieve perfect fairness """
 	apply_fairness_constraints = 1
 	cov_factor = 0
-	test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error(X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'], [{'s1':cov_factor} for i in range(0,NUM_FOLDS)])		
+	test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error(X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'], [{'s1':cov_factor} for i in range(0,NUM_FOLDS)])
 	print
 	print "== Constrained (fair) classifier =="
 	ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")
 
 	""" Now plot a tradeoff between the fairness and accuracy """
 	ut.plot_cov_thresh_vs_acc_pos_ratio(X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'])
 
-
 
 def main():
 	test_synthetic_data()
 
-
 if __name__ == '__main__':
-	main()
+	main()
diff --git a/disparate_impact/synthetic_data_demo/generate_synthetic_data.py b/disparate_impact/synthetic_data_demo/generate_synthetic_data.py
@@ -40,7 +40,7 @@ def gen_gaussian(mean_in, cov_in, class_label):
     shuffle(perm)
     X = X[perm]
     y = y[perm]
-    
+
     rotation_mult = np.array([[math.cos(disc_factor), -math.sin(disc_factor)], [math.sin(disc_factor), math.cos(disc_factor)]])
     X_aux = np.dot(X, rotation_mult)
 
@@ -53,12 +53,12 @@ def gen_gaussian(mean_in, cov_in, class_label):
         # probability for each cluster that the point belongs to it
         p1 = nv1.pdf(x)
         p2 = nv2.pdf(x)
-        
+
         # normalize the probabilities from 0 to 1
         s = p1+p2
         p1 = p1/s
         p2 = p2/s
-        
+
         r = np.random.uniform() # generate a random number from 0 to 1
 
         if r < p1: # the first cluster is the positive class

diff --git a/disparate_mistreatment/synthetic_data_demo/decision_boundary_demo.py b/disparate_mistreatment/synthetic_data_demo/decision_boundary_demo.py
@@ -9,7 +9,7 @@
 
 
 def test_synthetic_data():
-	
+
 	""" Generate the synthetic data """
 	data_type = 1
 	X, y, x_control = generate_synthetic_data(data_type=data_type, plot_data=True) # set plot_data to False to skip the data plot
@@ -28,10 +28,10 @@ def train_test_classifier():
 
 		train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(w, x_train, y_train, x_control_train, x_test, y_test, x_control_test, sensitive_attrs)
 
-		
+
 		# accuracy and FPR are for the test set because we need them for plotting
 		return w, test_score, s_attr_to_fp_fn_test
-		
+
 
 	""" Classify the data while optimizing for accuracy """
 	print
@@ -40,7 +40,7 @@ def train_test_classifier():
 	print "\n-----------------------------------------------------------------------------------\n"
 
 	""" Now classify such that we optimize for accuracy while achieving perfect fairness """
-	
+
 	print
 	print "== Classifier with fairness constraint =="
 
@@ -50,9 +50,9 @@ def train_test_classifier():
 	tau = 5.0
 	mu = 1.2
 	sensitive_attrs_to_cov_thresh = {"s1": {0:{0:0, 1:0}, 1:{0:0, 1:0}, 2:{0:0, 1:0}}} # zero covariance threshold, means try to get the fairest solution
-	cons_params = {"cons_type": cons_type, 
-					"tau": tau, 
-					"mu": mu, 
+	cons_params = {"cons_type": cons_type,
+					"tau": tau,
+					"mu": mu,
 					"sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}
 
 	w_cons, acc_cons, s_attr_to_fp_fn_test_cons  = train_test_classifier()
@@ -84,4 +84,4 @@ def main():
 
 
 if __name__ == '__main__':
-	main()
+	main()
diff --git a/disparate_mistreatment/synthetic_data_demo/fairness_acc_tradeoff.py b/disparate_mistreatment/synthetic_data_demo/fairness_acc_tradeoff.py
@@ -10,7 +10,7 @@
 import matplotlib.pyplot as plt # for plotting stuff
 
 def test_synthetic_data():
-	
+
 	""" Generate the synthetic data """
 	data_type = 1
 	X, y, x_control = generate_synthetic_data(data_type=data_type, plot_data=False) # set plot_data to False to skip the data plot
@@ -29,11 +29,11 @@ def train_test_classifier():
 
 		train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(w, x_train, y_train, x_control_train, x_test, y_test, x_control_test, sensitive_attrs)
 
-		
+
 		# accuracy and FPR are for the test set because we need them for plotting
 		# the covariance is for train, because we need it for setting the thresholds
 		return w, test_score, s_attr_to_fp_fn_test, cov_all_train
-		
+
 
 	""" Classify the data while optimizing for accuracy """
 	print
@@ -42,7 +42,7 @@ def train_test_classifier():
 	print "\n-----------------------------------------------------------------------------------\n"
 
 	""" Now classify such that we optimize for accuracy while achieving perfect fairness """
-	
+
 	print
 	print "== Classifier with fairness constraint =="
 
@@ -66,13 +66,13 @@ def train_test_classifier():
 					sensitive_attrs_to_cov_thresh[s_attr][cov_type][s_val] *= m
 
 
-		cons_params = {"cons_type": cons_type, 
-						"tau": tau, 
-						"mu": mu, 
+		cons_params = {"cons_type": cons_type,
+						"tau": tau,
+						"mu": mu,
 						"sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}
 
 		w_cons, acc_cons, s_attr_to_fp_fn_test_cons, cov_all_train_cons  = train_test_classifier()
-		
+
 		fpr_per_group[0].append(s_attr_to_fp_fn_test_cons["s1"][0.0]["fpr"])
 		fpr_per_group[1].append(s_attr_to_fp_fn_test_cons["s1"][1.0]["fpr"])
 		fnr_per_group[0].append(s_attr_to_fp_fn_test_cons["s1"][0.0]["fnr"])
@@ -111,4 +111,4 @@ def main():
 
 
 if __name__ == '__main__':
-	main()
+	main()
diff --git a/disparate_mistreatment/synthetic_data_demo/generate_synthetic_data.py b/disparate_mistreatment/synthetic_data_demo/generate_synthetic_data.py
@@ -9,7 +9,7 @@
 SEED = 1122334455
 seed(SEED) # set the random seed so that the random permutations can be reproduced again
 np.random.seed(SEED)
-sys.path.insert(0, '../../fair_classification/') 
+sys.path.insert(0, '../../fair_classification/')
 import utils as ut
 
 
@@ -100,7 +100,7 @@ def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):
     y = y[perm]
     x_control = x_control[perm]
 
-    
+
     """ Plot the data """
     if plot_data:
         plt.figure()
@@ -131,7 +131,7 @@ def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):
 
     x_control = {"s1": x_control} # all the sensitive features are stored in a dictionary
     X = ut.add_intercept(X)
-    
+
 
     return X,y,x_control