-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark_ood.py
More file actions
150 lines (122 loc) · 5.33 KB
/
Copy pathbenchmark_ood.py
File metadata and controls
150 lines (122 loc) · 5.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Out-of-distribution benchmark: evaluate every disambiguation approach on real
Portuguese sentences, not synthetic ones.
The OOD set is the public Hugging Face dataset
``TigreGotico/bifonia-pt-homographs-wild`` (real Wikipedia + web sentences, one
meaning labelled per sentence). It is downloaded on demand, so no third-party text
lives in this repository. This is the honest generalisation number — synthetic
benchmarks (``benchmark_tagger.py``) run several points higher because their train
and test sentences share phrasing.
Scores: most-common · rules (no corpus) · Naive-Bayes · perceptron · shipped ensemble.
Usage::
pip install huggingface_hub
python benchmark_ood.py
"""
import json
import pathlib
from collections import Counter, defaultdict
from bifonia import tokenize, guess_sense
from bifonia.data import POS_SENSES
from bifonia.scoring import guess_pos as _rule_pos, resolve_sense as _rule_resolve
from bifonia.model import SenseModel, NB_PATH, PERCEPTRON_PATH
# Directory containing this script; the local paths below are resolved against it.
ROOT = pathlib.Path(__file__).parent
# Local JSONL file of training records (one JSON object per line).
TRAIN = ROOT / "hf" / "train.jsonl"
# Hugging Face dataset repository from which the OOD test split is downloaded.
HF_REPO = "TigreGotico/bifonia-pt-homographs-wild"
# JSON file used to cache the spaCy/Stanza tags between runs.
TAG_CACHE = ROOT / "scratch" / "ood_tagger_cache.json"
def _load_ood():
    """Download the OOD test split and return its records.

    Fetches ``test.jsonl`` from the ``HF_REPO`` dataset repository with
    ``hf_hub_download`` and decodes every non-blank line as JSON.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
    """
    # Function-scope import: merely importing this module does not require
    # huggingface_hub to be installed.
    from huggingface_hub import hf_hub_download

    local_file = hf_hub_download(HF_REPO, "test.jsonl", repo_type="dataset")
    text = pathlib.Path(local_file).read_text(encoding="utf-8")

    records = []
    for line in text.splitlines():
        if line.strip():
            records.append(json.loads(line))
    return records
def _tag(recs):
    """Return ``{sentence: {"spacy": upos, "stanza": upos}}`` for the homograph token.

    For every record the sentence is tagged with spaCy and with Stanza, and the
    tag of the first token whose lowercased text equals the record's word is
    stored (``None`` when no token matches). A tagger's key is absent from an
    entry when that tagger could not be run for the sentence.

    Results are cached in ``TAG_CACHE``. The cache is tracked per tagger: a
    sentence is only considered done for a tagger once its entry holds that
    tagger's key, so a tagger that was unavailable on an earlier run is retried
    on the next one instead of being masked by a cache of empty entries, and
    tags that were already computed are reused rather than thrown away.

    Args:
        recs: iterable of mappings with ``"sentence"`` and ``"word"`` keys.

    Returns:
        The cache dict; it contains an entry for every sentence in ``recs``
        (and any other sentences already present in the cache file).
    """
    import warnings

    # Installs a process-wide "ignore" filter; it stays active after this
    # function returns.
    warnings.filterwarnings("ignore")

    # NOTE(review): keyed by sentence, so if the same sentence appears with two
    # different words only the last word is tagged — confirm this cannot happen
    # in the dataset.
    words = {r["sentence"]: r["word"] for r in recs}

    cache = {}
    if TAG_CACHE.exists():
        try:
            loaded = json.loads(TAG_CACHE.read_text(encoding="utf-8"))
        except ValueError:
            # Covers both invalid JSON and undecodable bytes: treat the cache
            # as empty and recompute rather than abort the benchmark.
            loaded = None
        if isinstance(loaded, dict):
            cache = loaded
    for s in words:
        if not isinstance(cache.get(s), dict):
            cache[s] = {}

    def tok_pos(tagged, w):
        """Return the tag of the first (token, tag) pair whose token matches ``w``."""
        for t, p in tagged:
            if t.lower() == w:
                return p
        return None

    # Only sentences lacking a tagger's key need that tagger; a stored ``None``
    # (token not found) counts as done.
    todo_spacy = [s for s in words if "spacy" not in cache[s]]
    todo_stanza = [s for s in words if "stanza" not in cache[s]]
    dirty = False

    if todo_spacy:
        try:
            import spacy
            nlp = spacy.load("pt_core_news_lg", disable=["parser", "ner", "lemmatizer"])
            for s, doc in zip(todo_spacy, nlp.pipe(todo_spacy, batch_size=512)):
                cache[s]["spacy"] = tok_pos([(t.text, t.pos_) for t in doc], words[s])
                dirty = True
        except Exception as e:
            # Best effort: the benchmark still runs without this tagger.
            print(f" (spaCy unavailable: {e})")

    if todo_stanza:
        try:
            import stanza
            nlp_s = stanza.Pipeline("pt", processors="tokenize,pos", verbose=False,
                                    tokenize_no_ssplit=True)
            for k in range(0, len(todo_stanza), 256):
                batch = todo_stanza[k:k + 256]
                docs = nlp_s.bulk_process([stanza.Document([], text=s) for s in batch])
                for s, doc in zip(batch, docs):
                    tagged = [(w.text, w.upos) for sent in doc.sentences for w in sent.words]
                    cache[s]["stanza"] = tok_pos(tagged, words[s])
                    dirty = True
        except Exception as e:
            # Best effort: the benchmark still runs without this tagger.
            print(f" (Stanza unavailable: {e})")

    # The cache directory is never created here; caching is skipped when it is
    # missing. Nothing is written when no new tag was produced.
    if dirty and TAG_CACHE.parent.exists():
        TAG_CACHE.write_text(json.dumps(cache, ensure_ascii=False), encoding="utf-8")
    return cache
def main():
    """Score every disambiguation approach on the OOD set and print the results.

    Loads the OOD records and the local training file, derives per-word sense
    frequencies from the training data, tags the OOD sentences, loads the
    Naive-Bayes and perceptron models, then prints an overall accuracy table
    followed by a per-word table for rules, NB, perceptron and the shipped
    ensemble.
    """
    recs = _load_ood()

    train = []
    for line in TRAIN.read_text(encoding="utf-8").splitlines():
        if line.strip():
            train.append(json.loads(line))

    # Sense counts per word in the training data, and the most frequent sense
    # of each word.
    freq = defaultdict(Counter)
    for row in train:
        freq[row["word"]][row["sense"]] += 1
    most_common = {}
    for word, counts in freq.items():
        most_common[word] = counts.most_common(1)[0][0]

    def pos_to_sense(word, upos):
        """Map a POS tag to the most frequent candidate sense, else the word's most common sense."""
        cands = POS_SENSES.get(word, {}).get(upos)
        if not cands:
            return most_common.get(word)
        # First candidate with the highest training frequency.
        return max(cands, key=lambda s: freq[word][s])

    print("tagging OOD sentences (spaCy + Stanza)…")
    tags = _tag(recs)
    nb = SenseModel.load(str(NB_PATH))
    perc = SenseModel.load(str(PERCEPTRON_PATH))

    def toks(r):
        """Return (tokens, index of the word) for a record, or (None, None) if the word is absent."""
        tokens = tokenize(r["sentence"].lower())
        if r["word"] in tokens:
            return tokens, tokens.index(r["word"])
        return None, None

    def rules(r):
        tokens, idx = toks(r)
        if not tokens:
            return None
        return _rule_resolve(r["word"], tokens, idx, _rule_pos(tokens, idx))

    def model(m, r):
        tokens, idx = toks(r)
        if tokens and m.has(r["word"]):
            return m.predict(r["word"], tokens, idx)
        return None

    def shipped(r):
        tokens, idx = toks(r)
        if not tokens:
            return None
        return guess_sense(tokens, idx)

    def by_tagger(tagger):
        """Build a predictor that maps the cached tag of ``tagger`` to a sense."""
        def predict(r):
            return pos_to_sense(r["word"], tags.get(r["sentence"], {}).get(tagger))
        return predict

    def baseline(r):
        return most_common.get(r["word"])

    def nb_predict(r):
        return model(nb, r)

    def perc_predict(r):
        return model(perc, r)

    def hits(fn, rows):
        """Count the rows whose predicted sense equals the labelled sense."""
        return sum(fn(r) == r["sense"] for r in rows)

    appr = {
        "most-common": baseline,
        "spaCy": by_tagger("spacy"),
        "Stanza": by_tagger("stanza"),
        "rules(free)": rules,
        "NB": nb_predict,
        "perceptron": perc_predict,
        "shipped": shipped,
    }

    total = len(recs)
    n_words = len({r["word"] for r in recs})
    print(f"OOD set: {total} real sentences, {n_words} words\n")
    print(f"{'approach':<14} accuracy")
    for name, fn in appr.items():
        ok = hits(fn, recs)
        print(f"{name:<14} {ok / total * 100:6.2f}% ({ok}/{total})")

    print("\nper-word (n | rules | NB | perceptron | shipped):")
    byw = defaultdict(list)
    for r in recs:
        byw[r["word"]].append(r)
    for w in sorted(byw):
        rs = byw[w]
        n = len(rs)
        # Same column order as the header: rules, NB, perceptron, shipped.
        cells = [
            f"{hits(fn, rs) / n * 100:5.0f}"
            for fn in (rules, nb_predict, perc_predict, shipped)
        ]
        print(f" {w:<12} {n:>3} | " + " | ".join(cells))
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()