bifonia/benchmark_ood.py at dev · TigreGotico/bifonia · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Out-of-distribution benchmark: evaluate every disambiguation approach on real
Portuguese sentences, not synthetic ones.

The OOD set is the public Hugging Face dataset
``TigreGotico/bifonia-pt-homographs-wild`` (real Wikipedia + web sentences, one
meaning labelled per sentence). It is downloaded on demand, so no third-party text
lives in this repository. This is the honest generalisation number — synthetic
benchmarks (``benchmark_tagger.py``) run several points higher because their train
and test sentences share phrasing.

Scores: most-common · spaCy · Stanza · rules (no corpus) · Naive-Bayes · perceptron · shipped ensemble.

Usage::

    pip install huggingface_hub
    python benchmark_ood.py
"""
import json
import pathlib
from collections import Counter, defaultdict

from bifonia import tokenize, guess_sense
from bifonia.data import POS_SENSES
from bifonia.scoring import guess_pos as _rule_pos, resolve_sense as _rule_resolve
from bifonia.model import SenseModel, NB_PATH, PERCEPTRON_PATH

# Directory containing this script; the paths below are resolved against it.
ROOT = pathlib.Path(__file__).parent
# JSONL training records; main() reads their "word"/"sense" fields to build
# per-word sense frequencies.
TRAIN = ROOT / "hf" / "train.jsonl"
# Hugging Face dataset repo from which _load_ood() fetches test.jsonl.
HF_REPO = "TigreGotico/bifonia-pt-homographs-wild"
# JSON file in which _tag() stores the spaCy/Stanza tags between runs.
TAG_CACHE = ROOT / "scratch" / "ood_tagger_cache.json"


def _load_ood():
    """Fetch ``test.jsonl`` from the ``HF_REPO`` dataset and parse it.

    ``huggingface_hub`` is imported inside the function rather than at module
    level. Returns a list with one parsed JSON object per non-blank line.
    """
    from huggingface_hub import hf_hub_download

    local_file = hf_hub_download(HF_REPO, "test.jsonl", repo_type="dataset")
    text = pathlib.Path(local_file).read_text(encoding="utf-8")
    records = []
    for line in text.splitlines():
        if line.strip():  # skip blank lines
            records.append(json.loads(line))
    return records


def _tag(recs):
    """Return ``{sentence: {"spacy": upos, "stanza": upos}}`` for the homograph token.

    Tags are cached as JSON in ``TAG_CACHE`` (the file is written only when its
    parent directory already exists). Within a sentence's entry, a tagger's key
    is absent when that tagger has not run on the sentence, and its value is
    ``None`` when the tagger ran but no token equalled the word.

    The cache is filled incrementally: each tagger runs only on the sentences
    that lack its key. A run in which spaCy or Stanza could not be loaded
    therefore does not permanently leave that tagger's tags out of the cache;
    they are computed on the next run where the tagger is available. An
    unreadable or malformed cache file is ignored and rebuilt.

    Args:
        recs: records providing ``"sentence"`` and ``"word"`` keys.

    Returns:
        The cache dict. It may also contain entries for sentences that are not
        in ``recs`` if an earlier run cached them.
    """
    import warnings
    # Installs an "ignore" filter for all warnings for the rest of the process.
    warnings.filterwarnings("ignore")
    words = {r["sentence"]: r["word"] for r in recs}

    cache = {}
    if TAG_CACHE.exists():
        try:
            loaded = json.loads(TAG_CACHE.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # The cache is only an optimisation: fall back to re-tagging.
            print(f"  (ignoring unreadable tag cache: {e})")
        else:
            if isinstance(loaded, dict):
                cache = loaded
    for s in words:
        if not isinstance(cache.get(s), dict):
            cache[s] = {}

    # Per-tagger work lists: key presence (not value) marks "already tagged",
    # because a legitimate result can be None.
    need_spacy = [s for s in words if "spacy" not in cache[s]]
    need_stanza = [s for s in words if "stanza" not in cache[s]]
    if not need_spacy and not need_stanza:
        return cache

    def tok_pos(tagged, w):
        # POS of the first token whose lowercased text equals the target word.
        for t, p in tagged:
            if t.lower() == w:
                return p
        return None

    if need_spacy:
        try:
            import spacy
            nlp = spacy.load("pt_core_news_lg", disable=["parser", "ner", "lemmatizer"])
            for s, doc in zip(need_spacy, nlp.pipe(need_spacy, batch_size=512)):
                cache[s]["spacy"] = tok_pos([(t.text, t.pos_) for t in doc], words[s])
        except Exception as e:
            # Best effort: the benchmark still runs without this tagger.
            print(f"  (spaCy unavailable: {e})")
    if need_stanza:
        try:
            import stanza
            nlp_s = stanza.Pipeline("pt", processors="tokenize,pos", verbose=False,
                                    tokenize_no_ssplit=True)
            for k in range(0, len(need_stanza), 256):
                batch = need_stanza[k:k + 256]
                docs = nlp_s.bulk_process([stanza.Document([], text=b) for b in batch])
                for s, doc in zip(batch, docs):
                    tagged = [(w.text, w.upos) for sent in doc.sentences for w in sent.words]
                    cache[s]["stanza"] = tok_pos(tagged, words[s])
        except Exception as e:
            # Best effort: the benchmark still runs without this tagger.
            print(f"  (Stanza unavailable: {e})")
    if TAG_CACHE.parent.exists():
        TAG_CACHE.write_text(json.dumps(cache, ensure_ascii=False), encoding="utf-8")
    return cache


def main():
    """Score every disambiguation approach on the OOD set and print the results.

    Loads the OOD records and the training records, builds per-word sense
    frequencies from the training data, then prints overall accuracy for each
    approach followed by a per-word table for rules, NB, perceptron and the
    shipped ensemble.

    Each approach is run over the records exactly once; the per-word table is
    derived from the per-record correctness flags collected during that pass.

    Raises:
        SystemExit: if the OOD set contains no records (accuracy would be
            undefined).
    """
    recs = _load_ood()
    if not recs:
        raise SystemExit(f"{HF_REPO}: test.jsonl has no records — nothing to evaluate")
    train = [json.loads(line)
             for line in TRAIN.read_text(encoding="utf-8").splitlines() if line.strip()]

    # Sense frequencies per word in the training data, and each word's top sense.
    freq = defaultdict(Counter)
    for r in train:
        freq[r["word"]][r["sense"]] += 1
    most_common = {w: c.most_common(1)[0][0] for w, c in freq.items()}

    def pos_to_sense(word, upos):
        # Map a tagger's UPOS to a sense: among the senses listed for that POS,
        # pick the one most frequent in training; otherwise use the word's
        # most common sense.
        cands = POS_SENSES.get(word, {}).get(upos)
        if cands:
            return max(cands, key=lambda s: freq[word][s])
        return most_common.get(word)

    print("tagging OOD sentences (spaCy + Stanza)…")
    tags = _tag(recs)

    nb = SenseModel.load(str(NB_PATH))
    perc = SenseModel.load(str(PERCEPTRON_PATH))

    def toks(r):
        # Tokens of the lowercased sentence and the index of the first
        # occurrence of the target word, or (None, None) if it is not a token.
        t = tokenize(r["sentence"].lower())
        return (t, t.index(r["word"])) if r["word"] in t else (None, None)

    def rules(r):
        t, i = toks(r)
        return _rule_resolve(r["word"], t, i, _rule_pos(t, i)) if t else None

    def model(m, r):
        t, i = toks(r)
        return m.predict(r["word"], t, i) if t and m.has(r["word"]) else None

    def shipped(r):
        t, i = toks(r)
        return guess_sense(t, i) if t else None

    appr = {
        "most-common": lambda r: most_common.get(r["word"]),
        "spaCy": lambda r: pos_to_sense(r["word"], tags.get(r["sentence"], {}).get("spacy")),
        "Stanza": lambda r: pos_to_sense(r["word"], tags.get(r["sentence"], {}).get("stanza")),
        "rules(free)": rules,
        "NB": lambda r: model(nb, r),
        "perceptron": lambda r: model(perc, r),
        "shipped": shipped,
    }

    print(f"OOD set: {len(recs)} real sentences, {len({r['word'] for r in recs})} words\n")
    print(f"{'approach':<14} accuracy")
    # correct[name][k] is True when approach `name` got record k right; kept so
    # the per-word table below does not have to re-run inference.
    correct = {}
    for name, fn in appr.items():
        correct[name] = [fn(r) == r["sense"] for r in recs]
        ok = sum(correct[name])
        print(f"{name:<14} {ok / len(recs) * 100:6.2f}%  ({ok}/{len(recs)})")

    print("\nper-word (n | rules | NB | perceptron | shipped):")
    rows_by_word = defaultdict(list)
    for k, r in enumerate(recs):
        rows_by_word[r["word"]].append(k)
    for w in sorted(rows_by_word):
        rows = rows_by_word[w]
        n = len(rows)  # always >= 1: a word is only present if a record has it
        pct = {name: sum(correct[name][k] for k in rows) / n * 100
               for name in ("rules(free)", "NB", "perceptron", "shipped")}
        print(f"  {w:<12} {n:>3} | {pct['rules(free)']:5.0f} | {pct['NB']:5.0f} | "
              f"{pct['perceptron']:5.0f} | {pct['shipped']:5.0f}")


# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()