-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebias.py
More file actions
74 lines (61 loc) · 2.57 KB
/
debias.py
File metadata and controls
74 lines (61 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from __future__ import print_function, division
import we
import json
import numpy as np
import argparse
import sys
if sys.version_info[0] < 3:
import io
open = io.open
"""
Hard-debias embedding
Man is to Computer Programmer as Woman is to Homemaker? Debiasing Word Embeddings
Tolga Bolukbasi, Kai-Wei Chang, James Zou, Venkatesh Saligrama, and Adam Kalai
2016
"""
def debias(E, gender_specific_words, definitional, equalize):
    """Hard-debias the embedding ``E`` in place: neutralize, then equalize.

    Args:
        E: Embedding object. This function relies on ``E.words``,
            ``E.vecs``, ``E.index``, ``E.v(word)`` and ``E.normalize()``.
        gender_specific_words: Iterable of words that are skipped by the
            neutralize step.
        definitional: Passed to ``we.doPCA`` together with ``E``; the first
            component of the result is used as the gender direction.
        equalize: Iterable of ``(word1, word2)`` string pairs. Each pair is
            tried in lower, title and upper case; variants with a word
            missing from ``E.index`` are ignored.

    Returns:
        None. ``E.vecs`` is overwritten row by row and re-normalized.
    """
    gender_direction = we.doPCA(definitional, E).components_[0]

    # Neutralize: every word that is not listed as gender specific is passed
    # through we.drop with the gender direction (presumably removing its
    # component along that direction -- confirm against we.drop).
    specific_set = set(gender_specific_words)
    for i, w in enumerate(E.words):
        if w not in specific_set:
            E.vecs[i] = we.drop(E.vecs[i], gender_direction)
    E.normalize()

    # Equalize: build the set of case variants so duplicates collapse.
    candidates = {
        variant
        for e1, e2 in equalize
        for variant in (
            (e1.lower(), e2.lower()),
            (e1.title(), e2.title()),
            (e1.upper(), e2.upper()),
        )
    }
    print(candidates)
    for a, b in candidates:
        if a not in E.index or b not in E.index:
            continue
        vec_a, vec_b = E.v(a), E.v(b)
        # Shared part of the pair: their midpoint after we.drop.
        y = we.drop((vec_a + vec_b) / 2, gender_direction)
        # Magnitude along the gender direction chosen so that
        # ||y||^2 + z^2 == 1. Rounding can push ||y||^2 marginally above 1,
        # which would make the radicand negative and the result NaN, so the
        # radicand is clamped at zero (np.maximum still propagates a NaN
        # that is already present in y).
        z = np.sqrt(np.maximum(0.0, 1 - np.linalg.norm(y) ** 2))
        # Keep each word on the side of the gender direction it started on.
        if (vec_a - vec_b).dot(gender_direction) < 0:
            z = -z
        E.vecs[E.index[a]] = z * gender_direction + y
        E.vecs[E.index[b]] = -z * gender_direction + y
    E.normalize()
if __name__ == "__main__":
    # Command-line entry point: load an embedding plus three JSON word
    # lists, run debias() on the embedding, and save the result.
    parser = argparse.ArgumentParser()
    parser.add_argument("embedding_filename", help="The name of the embedding")
    parser.add_argument("definitional_filename", help="JSON of definitional pairs")
    parser.add_argument("gendered_words_filename", help="JSON list of gender-specific words not to neutralize")
    parser.add_argument("equalize_filename", help="JSON of word pairs to equalize")
    parser.add_argument("debiased_filename", help="Filename to save the debiased embedding to")
    args = parser.parse_args()

    # All three word lists are parsed with json.load; read them as UTF-8
    # rather than relying on the platform's default encoding.
    with open(args.definitional_filename, "r", encoding="utf-8") as f:
        defs = json.load(f)

    with open(args.equalize_filename, "r", encoding="utf-8") as f:
        equalize_pairs = json.load(f)

    with open(args.gendered_words_filename, "r", encoding="utf-8") as f:
        gender_specific_words = json.load(f)

    E = we.WordEmbedding(args.embedding_filename)

    print("Debiasing...")
    debias(E, gender_specific_words, defs, equalize_pairs)

    print("Saving to file...")
    # save_w2v is used only when both the input and the output filename end
    # in ".bin"; every other combination goes through E.save.
    both_bin = (args.embedding_filename.endswith(".bin")
                and args.debiased_filename.endswith(".bin"))
    if both_bin:
        E.save_w2v(args.debiased_filename)
    else:
        E.save(args.debiased_filename)

    print("\n\nDone!\n")