-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataset.py
More file actions
122 lines (98 loc) · 4.23 KB
/
Copy pathdataset.py
File metadata and controls
122 lines (98 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Export the bifonia labeled corpus as CSV, JSON, and HuggingFace-compatible splits.
Output files
------------
dataset.csv — full corpus: word, sense, pos, ipa, diacritized, sentence, diacritized_sentence
dataset.json — same as JSON array
hf/train.jsonl — 80% split, stratified per (word, sense), for HF upload
hf/test.jsonl — 20% split
The bucket key is MEANING (`sense`); `pos` is a descriptive attribute. The
`diacritized*` columns carry the disambiguating diacritic (acute = open vowel,
circumflex = closed) restored on the homograph — the target for a
diacritics-restoration model.
Usage::
python dataset.py [--out <dir>] [--hf] [--seed <int>]
"""
import csv
import json
import argparse
import pathlib
import random
import re
from collections import Counter, defaultdict
from bifonia import HOMOGRAPHS
from bifonia.data import SENSE_POS
from bifonia.__init__ import _DIACRITIZED
from bifonia.corpus import iter_records
def _diacritized_sentence(sentence: str, word: str, sense: str) -> str:
    """Return *sentence* with each whole-word occurrence of *word* diacritized.

    The diacritized spelling is looked up in ``_DIACRITIZED`` under the
    ``(word, sense)`` key. When that lookup yields nothing (missing or empty
    entry) the sentence is returned unchanged.

    Matching is case-insensitive. The casing of every matched token is carried
    over to its replacement, so a sentence-initial ("Word") or all-caps
    ("WORD") occurrence keeps its capitalisation instead of being silently
    lowercased to the lexicon spelling.
    """
    diac = _DIACRITIZED.get((word, sense))
    if not diac:
        return sentence

    def _restore(match: "re.Match") -> str:
        # Mirror the casing of the token actually found in the sentence.
        found = match.group(0)
        if len(found) > 1 and found.isupper():
            return diac.upper()
        if found[:1].isupper():
            return diac[:1].upper() + diac[1:]
        return diac

    # A callable replacement is inserted verbatim, so a backslash in the
    # diacritized form can never be misread as an escape or group reference.
    pattern = rf"\b{re.escape(word)}\b"
    return re.sub(pattern, _restore, sentence, flags=re.IGNORECASE)
def build_records() -> list[dict]:
    """Materialise the labeled corpus as a list of flat, export-ready dicts.

    Every ``(word, sense, sentence)`` triple produced by ``iter_records``
    becomes one dict with the keys ``word``, ``sense``, ``pos``, ``ipa``,
    ``diacritized``, ``sentence`` and ``diacritized_sentence``. ``pos`` and
    ``ipa`` fall back to an empty string, and ``diacritized`` to the bare
    word, when the corresponding lookup table has no entry.
    """
    def _row(word, sense, sentence) -> dict:
        # One output row; key order here is the column order of the export.
        return {
            "word": word,
            "sense": sense,
            "pos": SENSE_POS.get(word, {}).get(sense, ""),
            "ipa": HOMOGRAPHS.get(word, {}).get(sense, ""),
            "diacritized": _DIACRITIZED.get((word, sense), word),
            "sentence": sentence,
            "diacritized_sentence": _diacritized_sentence(sentence, word, sense),
        }

    return [_row(word, sense, sentence) for word, sense, sentence in iter_records()]
def stratified_split(records: list[dict], test_frac: float = 0.2,
                     seed: int = 42) -> tuple[list[dict], list[dict]]:
    """Split *records* into shuffled train/test lists, stratified per (word, sense).

    Records are bucketed by their ``(word, sense)`` pair. Each bucket is
    shuffled and roughly ``test_frac`` of it (rounded down, but at least one
    record) goes to the test split; the rest goes to train.

    A bucket never gives up its last record to test: at most
    ``len(bucket) - 1`` records are held out. A single-example bucket
    therefore stays entirely in train, so every (word, sense) class is
    always present in the training data.

    Args:
        records: Dicts carrying at least the ``"word"`` and ``"sense"`` keys.
            The list itself is not reordered.
        test_frac: Target fraction of each bucket to hold out for testing.
        seed: Seed for the private ``random.Random`` used for all shuffling,
            making the split reproducible.

    Returns:
        A ``(train, test)`` tuple of lists, each shuffled across buckets.
    """
    rng = random.Random(seed)
    by_class: dict[tuple, list[dict]] = defaultdict(list)
    for r in records:
        by_class[(r["word"], r["sense"])].append(r)

    train: list[dict] = []
    test: list[dict] = []
    # Visit buckets in sorted key order so the sequence of RNG draws is fixed
    # for a given seed.
    for key in sorted(by_class):
        group = by_class[key]
        rng.shuffle(group)
        n_test = max(1, int(len(group) * test_frac))
        # Always leave at least one example of the class for training.
        n_test = min(n_test, len(group) - 1)
        test.extend(group[:n_test])
        train.extend(group[n_test:])

    rng.shuffle(train)
    rng.shuffle(test)
    return train, test
def write_jsonl(records: list[dict], path: pathlib.Path) -> None:
    """Write *records* to *path* in JSON Lines form.

    Each record is serialised with ``json.dumps`` (non-ASCII characters kept
    as-is) on its own newline-terminated line. The file is opened in write
    mode as UTF-8, so any existing content is replaced.
    """
    serialised = (json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(serialised)
def main():
    """Command-line entry point: export the corpus and print a short summary.

    Parses ``--out``, ``--hf`` and ``--seed``, creates the output directory,
    then writes ``dataset.csv`` and ``dataset.json`` from ``build_records()``.
    With ``--hf`` it also writes ``hf/train.jsonl`` and ``hf/test.jsonl`` from
    ``stratified_split``. A totals line is printed at the end.
    """
    cli = argparse.ArgumentParser(description="Export bifonia corpus.")
    cli.add_argument("--out", default=".", help="Output directory (default: current)")
    cli.add_argument("--hf", action="store_true",
                     help="Also write HF train/test JSONL splits under <out>/hf/")
    cli.add_argument("--seed", type=int, default=42, help="Random seed for split")
    opts = cli.parse_args()

    out_dir = pathlib.Path(opts.out)
    out_dir.mkdir(parents=True, exist_ok=True)

    rows = build_records()
    total = len(rows)

    # Full corpus as CSV; the column order is fixed explicitly.
    columns = ["word", "sense", "pos", "ipa", "diacritized", "sentence", "diacritized_sentence"]
    csv_path = out_dir / "dataset.csv"
    with csv_path.open("w", newline="", encoding="utf-8") as sink:
        table = csv.DictWriter(sink, fieldnames=columns)
        table.writeheader()
        table.writerows(rows)
    print(f"Wrote {total} rows → {csv_path}")

    # Same data as a single pretty-printed JSON array.
    json_path = out_dir / "dataset.json"
    with json_path.open("w", encoding="utf-8") as sink:
        json.dump(rows, sink, ensure_ascii=False, indent=2)
    print(f"Wrote {total} records → {json_path}")

    if opts.hf:
        hf_dir = out_dir / "hf"
        hf_dir.mkdir(exist_ok=True)
        train, test = stratified_split(rows, seed=opts.seed)
        write_jsonl(train, hf_dir / "train.jsonl")
        write_jsonl(test, hf_dir / "test.jsonl")
        print(f"HF splits → {hf_dir}/ (train={len(train)}, test={len(test)})")

    # NOTE(review): the summary is assumed to run regardless of --hf; the
    # indentation of this section should be confirmed against the repository.
    distinct_words = {r["word"] for r in rows}
    distinct_senses = {f"{r['word']}/{r['sense']}" for r in rows}
    print(f"\nTotal: {total} sentences across {len(distinct_words)} words, {len(distinct_senses)} senses")
# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()