kor-nonce-words/stratify.py at main · CUNY-CL/kor-nonce-words · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
"""Implements stratified sampling for Korean.

This produces two lists of 60 words with the same gross properties. For each
list:

* 20 of the words are monosyllables; 40 are disyllables.
* 30 of the words are expected to be well-formed; 30 of the words are expected
  to be ill-formed.
"""


import collections
import csv
import random

from typing import Any, Dict, Iterator, Tuple


# Seed passed to random.seed() in main() so every run samples the same lists.
SEED = 1568
# Input TSV files read by main() through _proc_file().
MONOSYLLABLES = "monosyllables-annotated.tsv"
DISYLLABLES = "disyllables-annotated.tsv"
# Output TSV files written by main(), one per sampled list.
LIST1 = "kor-list-1.tsv"
LIST2 = "kor-list-2.tsv"


def _proc_file(path: str) -> Iterator[Tuple[str, Any]]:
    with open(path, "r") as source:
        for row in csv.DictReader(source, delimiter="\t"):
            if row["lexicality"] != "FALSE":
                continue
            del row["lexicality"]
            del row["memo"]
            shape = row["shape"]
            row["transcription"] = (
                row["onset1"]
                + row["nucleus1"]
                + row["onset2"]
                + row["nucleus2"]
                + row["coda"]
            )
            yield shape, row


def _write_list(path: str, rows: list, fieldnames: list) -> None:
    """Writes `rows` to `path` as a tab-separated file with a header line.

    Args:
        path: output file path; an existing file is overwritten.
        rows: row dictionaries to write, one per output line.
        fieldnames: column names, in output order.
    """
    # newline="" lets the csv module control line endings itself; the
    # explicit encoding keeps the output independent of the platform locale.
    with open(path, "w", encoding="utf-8", newline="") as sink:
        writer = csv.DictWriter(sink, delimiter="\t", fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def main() -> None:
    """Samples two disjoint, shape-stratified lists and writes them out.

    Rows from MONOSYLLABLES and DISYLLABLES are grouped by shape. Each shape
    group is shuffled; its first `size` rows go to the first list and the
    next `size` rows to the second. Both lists are then shuffled and written
    to LIST1 and LIST2.

    Raises:
        ValueError: if the inputs yield no rows, or if a shape has fewer
            than twice the number of rows sampled per list.
    """
    random.seed(SEED)  # Same result every time.
    # Shapes that contribute 5 rows per list instead of 10 (presumably the
    # monosyllabic shapes, giving the 20/40 split in the module docstring).
    small_shapes = frozenset(["CVC", "CwVC", "CNVC", "NCVC"])
    by_shape = collections.defaultdict(list)
    # Insertion-ordered set of every column seen, so the header does not
    # depend on a loop variable leaking out of the reading loops.
    fieldnames = {}
    for path in (MONOSYLLABLES, DISYLLABLES):
        for shape, row in _proc_file(path):
            by_shape[shape].append(row)
            fieldnames.update(dict.fromkeys(row))
    if not by_shape:
        raise ValueError("no usable rows found in the input files")
    list1 = []
    list2 = []
    for shape, entries in by_shape.items():
        size = 5 if shape in small_shapes else 10
        # An explicit raise (not assert) so the check survives `python -O`.
        if len(entries) < 2 * size:
            raise ValueError(
                f"shape {shape!r} has {len(entries)} rows; "
                f"need at least {2 * size}"
            )
        random.shuffle(entries)
        list1.extend(entries[:size])
        list2.extend(entries[size : 2 * size])
    random.shuffle(list1)
    random.shuffle(list2)
    columns = list(fieldnames)
    _write_list(LIST1, list1, columns)
    _write_list(LIST2, list2, columns)


# Run the sampling only when this file is executed as a script.
if __name__ == "__main__":
    main()