-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstratify.py
More file actions
executable file
·78 lines (65 loc) · 2.2 KB
/
Copy pathstratify.py
File metadata and controls
executable file
·78 lines (65 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
"""Implements stratified sampling for Korean.
This produces two lists of 60 words with the same gross properties. For each
list:
* 20 of the words are monosyllables; 40 are disyllables.
* 30 of the words are expected to be well-formed; 30 of the words are expected
to be ill-formed.
"""
import collections
import csv
import random
from typing import Any, Dict, Iterator, Tuple
# Seed handed to random.seed() in main() so that every run draws the same lists.
SEED = 1568
# Input TSV files; main() reads both through _proc_file().
MONOSYLLABLES = "monosyllables-annotated.tsv"
DISYLLABLES = "disyllables-annotated.tsv"
# Output TSV files, one per sampled word list.
LIST1 = "kor-list-1.tsv"
LIST2 = "kor-list-2.tsv"
def _proc_file(path: str) -> Iterator[Tuple[str, Any]]:
    """Yields (shape, row) pairs for the usable rows of an annotated TSV file.

    Only rows whose "lexicality" column is exactly "FALSE" are kept. For each
    kept row the "lexicality" and "memo" columns are removed and a
    "transcription" column is added, built by concatenating the "onset1",
    "nucleus1", "onset2", "nucleus2" and "coda" columns in that order.

    Args:
        path: Path to a tab-separated file whose first line is a header.

    Yields:
        Pairs of the row's "shape" value and the modified row dictionary.

    Raises:
        KeyError: If one of the columns named above is absent from the file.
    """
    segment_columns = ("onset1", "nucleus1", "onset2", "nucleus2", "coda")
    # The encoding is pinned to UTF-8 so decoding does not depend on the
    # platform locale, and newline="" lets the csv module handle line endings
    # itself (needed for line breaks inside quoted fields).
    with open(path, "r", encoding="utf-8", newline="") as source:
        for row in csv.DictReader(source, delimiter="\t"):
            if row["lexicality"] != "FALSE":
                continue
            del row["lexicality"]
            del row["memo"]
            row["transcription"] = "".join(
                row[column] for column in segment_columns
            )
            yield row["shape"], row
def main() -> None:
    """Builds the two stratified word lists and writes them out as TSV files.

    Rows yielded by _proc_file() for MONOSYLLABLES and DISYLLABLES are grouped
    by shape. Each shape's rows are shuffled; the first `size` rows go to
    list 1 and the next `size` rows go to list 2, so the two lists never share
    a row and hold the same number of rows for every shape. Both lists are then
    shuffled and written to LIST1 and LIST2 as UTF-8, tab-separated files.

    The random number generator is seeded with SEED and consumed in a fixed
    order (one shuffle per shape in first-seen order, then list 1, then
    list 2), so repeated runs over the same input give the same lists.

    Raises:
        ValueError: If no rows are read at all, or if a shape has fewer rows
            than are needed to fill both lists.
    """
    random.seed(SEED)  # Same result every time.
    by_shape = collections.defaultdict(list)
    last_row = None
    for source_path in (MONOSYLLABLES, DISYLLABLES):
        for shape, row in _proc_file(source_path):
            by_shape[shape].append(row)
            last_row = row
    if last_row is None:
        # Fail before the output files are opened (and thereby truncated).
        raise ValueError(
            f"no usable rows found in {MONOSYLLABLES} or {DISYLLABLES}"
        )
    # Every row is expected to carry the same columns, so the most recently
    # read row supplies the header for both output files.
    fieldnames = list(last_row)
    list1 = []
    list2 = []
    for shape, entries in by_shape.items():
        # Special cases for sizing: these shapes contribute half as many words
        # per list as the others (presumably the monosyllabic shapes; confirm
        # against the annotated data).
        size = 5 if shape in ("CVC", "CwVC", "CNVC", "NCVC") else 10
        # An explicit check rather than `assert`, which is skipped under -O.
        if len(entries) < 2 * size:
            raise ValueError(
                f"shape {shape!r} has {len(entries)} usable rows; "
                f"need at least {2 * size}"
            )
        random.shuffle(entries)
        list1.extend(entries[:size])
        list2.extend(entries[size : 2 * size])
    random.shuffle(list1)
    random.shuffle(list2)
    for sink_path, rows in ((LIST1, list1), (LIST2, list2)):
        # newline="" stops the text layer from rewriting the "\r\n" line
        # terminator that the csv writer emits on its own.
        with open(sink_path, "w", encoding="utf-8", newline="") as sink:
            writer = csv.DictWriter(
                sink, delimiter="\t", fieldnames=fieldnames
            )
            writer.writeheader()
            writer.writerows(rows)
# Run the sampler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()