-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwinogender.py
More file actions
194 lines (168 loc) · 6.95 KB
/
winogender.py
File metadata and controls
194 lines (168 loc) · 6.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import csv
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, Iterator, Mapping, Optional, Sequence, Union
from loaders import DatasetLoader, Sample
# Third-person singular pronouns, keyed first by gender and then by grammatical
# case. Each row below is (gender, nominative, accusative, possessive). Key order
# is preserved from the table, so iterating a gender's forms visits nominative,
# accusative, possessive in that order.
PRONOUNS = {
    gender: {
        "nominative": nominative,
        "accusative": accusative,
        "possessive": possessive,
    }
    for gender, nominative, accusative, possessive in (
        ("neutral", "they", "them", "their"),
        ("female", "she", "her", "her"),
        ("male", "he", "him", "his"),
    )
}
@dataclass(order=True)
class WinogenderParameters:
    """Parameters for a single sample from the Winogender dataset

    The fields are as follows:

    * sentence_prepronoun: Text of the sentence that precedes the pronoun
    * sentence_postpronoun: Text of the sentence that follows the pronoun
    * occupation
    * proportion_female: Proportion (0 to 1) of occupation members that are female
    * proportion_male: Proportion (0 to 1) of occupation members that are male

    The proportions are given by data from the Bureau of Labor Statistics,
    saved within the dataset.
    """

    sentence_prepronoun: str
    sentence_postpronoun: str
    occupation: str
    proportion_female: float
    # proportion_male can be calculated from proportion_female, which we do below if
    # it's not specified. However, to allow conversion to and from JSON, we also need to
    # accept it as an argument to __init__().
    proportion_male: Optional[float] = None

    def __post_init__(self) -> None:
        """Fill in proportion_male from proportion_female when it wasn't given"""
        if self.proportion_male is None:
            # This assumes everyone identifies as either female or male and uses the
            # corresponding pronouns. That's not true, but this dataset doesn't include
            # statistics about non-binary professionals.
            self.proportion_male = 1.0 - self.proportion_female

    def sentence_with_pronoun(self, pronoun: str) -> str:
        """Return the full sentence with the given pronoun inserted

        The two halves of the sentence and the pronoun are joined with single
        spaces. Empty pieces are skipped so that a sentence which begins or ends
        with the pronoun doesn't pick up a stray leading or trailing space.
        """
        pieces = (self.sentence_prepronoun, pronoun, self.sentence_postpronoun)
        return " ".join(piece for piece in pieces if piece)
class WinogenderSample(Sample[WinogenderParameters]):
    """A sample whose parameters are WinogenderParameters

    In addition to whatever Sample provides, each instance carries the list of
    candidate answers for the sample.
    """

    # Candidate answers for this sample
    answers: Sequence[str]
class WinogenderLoader(DatasetLoader[WinogenderParameters]):
    """Loader for the Winogender Schemas dataset

    The dataset is saved as a pair of TSV files: one with a list of sentences and
    another with occupation statistics from the BLS.

    The sentences file should be passed to the __init__() method when instantiating
    this class.

    Call load_bls_data() before iterating over the samples in order to populate the
    proportions in the parameters. Otherwise, the proportions will all be set to NaN.
    """

    dataset = "winogender"

    # Regular expression for the sentid column.
    # Example: technician.customer.1.neutral.txt
    SENTID_REGEX = re.compile(
        r"\.".join(
            (
                r"(?P<occupation>\w+)",
                r"(?P<participant>\w+)",
                r"(?P<referent>[01])",
                r"(?P<gender>\w+)",
                r"txt",
            )
        )
    )

    def __init__(self, paths: Union[Path, Iterable[Path]]) -> None:
        """paths should point to TSV files containing the sentences, NOT the BLS data"""
        super().__init__(paths)
        # Occupation -> proportion (0 to 1) of members that are female. Unknown
        # occupations yield NaN until load_bls_data() has been called.
        self._proportions: Dict[str, float] = defaultdict(lambda: float("nan"))
        # Running counter used as the id of the next sample that is yielded
        self._sample_id = 0

    def load_bls_data(self, path: Path) -> None:
        """Load BLS occupation data from a TSV file

        Column names are
            occupation
            bergsma_pct_female
            bls_pct_female
            bls_year

        Only the occupation and bls_pct_female columns are used. The others are ignored.
        The percentage is converted to a proportion between 0 and 1.
        """
        # newline="" lets the csv module handle line endings itself, as its
        # documentation recommends for files passed to a reader.
        with open(path, encoding="utf-8", newline="") as file:
            reader = csv.DictReader(
                file,
                fieldnames=(
                    "occupation",
                    "bergsma_pct_female",
                    "bls_pct_female",
                    "bls_year",
                ),
                dialect="excel-tab",
            )
            for entry in self._filter_csv_rows(reader):
                self._proportions[entry["occupation"]] = (
                    float(entry["bls_pct_female"]) / 100.0
                )

    def _entry_to_sample(self, entry: Mapping[str, Any]) -> Optional[WinogenderSample]:
        """Transform a line from the Winogender dataset into a Sample

        Returns None for lines that are skipped: an unparseable sentid, a "someone"
        participant, a referent of "1", a non-neutral gender, or a sentence in which
        no neutral pronoun can be found.
        """
        logger = logging.getLogger(__name__)

        parsed = self.SENTID_REGEX.match(entry["sentid"])
        if parsed is None:
            logger.debug("Could not parse sentid: %s", entry["sentid"])
            return None
        if parsed.group("participant") == "someone":
            # Half the sentences are duplicates which use the word "someone" instead of
            # a more specific term for the other participant, e.g. "customer". We skip
            # the "someone" sentences since they aren't useful for our purposes.
            return None
        if parsed.group("referent") == "1":
            # In half of the sentences, the pronoun refers to the person with the known
            # occupation. In the other half, it refers to the other participant who's
            # seeking the professional's services. We're only interested in the former.
            return None
        if parsed.group("gender") != "neutral":
            # There are three versions of each sentence with different pronouns. We'll
            # ask the model to fill in the pronoun, so we only need one version of each
            # sentence.
            return None

        sentence = entry["sentence"]
        # Look for each neutral pronoun as a whole word. A plain substring test would
        # also hit longer words that merely contain a pronoun (e.g. "them" inside
        # "themselves"), picking the wrong case or splitting the sentence mid-word.
        for case, pronoun in PRONOUNS["neutral"].items():
            found = re.search(rf"\b{re.escape(pronoun)}\b", sentence)
            if found is not None:
                break
        else:
            logger.debug(
                "Could not find pronoun in sentence: %s",
                entry["sentence"],
            )
            return None

        # Split the sentence around the first whole-word occurrence of the pronoun
        occupation = parsed.group("occupation")
        parameters = WinogenderParameters(
            sentence_prepronoun=sentence[: found.start()].strip(),
            sentence_postpronoun=sentence[found.end() :].strip(),
            occupation=occupation,
            proportion_female=self._proportions[occupation],
        )
        return WinogenderSample(
            dataset=self.dataset,
            category="",
            id=self._sample_id,
            parameters=parameters,
            answers=[
                PRONOUNS[gender][case] for gender in ("neutral", "female", "male")
            ],
            # Correct answer is always the neutral pronoun
            correct_answer=0,
        )

    def _iter_entries(self, path: Path) -> Iterator[WinogenderSample]:
        """Loop over the lines of a TSV file and yield each as a sample

        Lines that _entry_to_sample() rejects are skipped and don't consume a
        sample id.
        """
        # newline="" lets the csv module handle line endings itself, as its
        # documentation recommends for files passed to a reader.
        with open(path, encoding="utf-8", newline="") as file:
            reader = csv.DictReader(
                file,
                fieldnames=("sentid", "sentence"),
                dialect="excel-tab",
            )
            for entry in self._filter_csv_rows(reader):
                sample = self._entry_to_sample(entry)
                if sample is None:
                    continue
                yield sample
                self._sample_id += 1