Skip to content

Commit 79d02f4

Browse files
JarbasAl and claude committed
feat: implement G2PPlugin interface and add differential audit against orthography2ipa
- Add orthography2ipa>=0.3.0a1 to project dependencies
- MirandesePhonemizer now subclasses G2PPlugin; adds transcribe(), transcribe_word(word, context) and language_codes property; all eight backends inherit conformance with no changes to their existing public API
- Add DIALECT_TO_SPEC_CODE mapping and spec_for(dialect) helper in base.py; RAIANO maps to base mwl (no mwl-x-raiano spec yet, gated on M0)
- Add tests/test_orthography2ipa_interface.py: interface-conformance suite for all backends plus a differential audit comparing g2p.json candidates against orthography2ipa mwl spec graphemes (agree=41, differ=15, gate: agree>=differ)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent 7be5932 commit 79d02f4

3 files changed

Lines changed: 229 additions & 3 deletions

File tree

mwl_phonemizer/base.py

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,59 @@
1-
import abc
21
import json
32
import re
43
import os
54
from collections import Counter
5+
from typing import List, Optional
66
import editdistance
77
from enum import Enum
88

9+
import orthography2ipa
10+
from orthography2ipa.g2p_plugin import G2PPlugin, WordContext
11+
12+
913
class Dialects(str, Enum):
    """Closed set of Mirandese dialect identifiers.

    The enum mixes in ``str``, so every member is itself a string and
    compares equal to its lowercase value (``Dialects.CENTRAL == "central"``).
    """

    CENTRAL = "central"
    RAIANO = "raiano"
    SENDINESE = "sendinese"
1317

1418

15-
class MirandesePhonemizer:
19+
# ---------------------------------------------------------------------------
# Dialect -> orthography2ipa spec code
# ---------------------------------------------------------------------------
# Each dialect is keyed to the code used to look up its spec in
# orthography2ipa:
#   * CENTRAL   -> "mwl", the base Mirandese spec.
#   * RAIANO    -> "mwl" as well; orthography2ipa has no "mwl-x-raiano" spec
#                  yet, so this entry is to be updated once M0 seeds that
#                  dialect upstream.
#   * SENDINESE -> "mwl-x-sendim" (Miranda do Douro / Sendim sub-variety).
DIALECT_TO_SPEC_CODE: dict[Dialects, str] = {
    Dialects.CENTRAL: "mwl",
    Dialects.RAIANO: "mwl",  # falls back to the base spec for now
    Dialects.SENDINESE: "mwl-x-sendim",
}
31+
32+
33+
def spec_for(dialect: Dialects):
    """Fetch the ``orthography2ipa`` spec registered for *dialect*.

    The dialect is translated to its spec code through
    ``DIALECT_TO_SPEC_CODE`` and that code is passed to
    ``orthography2ipa.get``, whose result is returned as-is.

    RAIANO currently shares the base ``mwl`` code, because a dedicated
    ``mwl-x-raiano`` entry has not been seeded upstream yet (Phase M0).

    Raises:
        KeyError: if *dialect* has no entry in ``DIALECT_TO_SPEC_CODE``.
    """
    return orthography2ipa.get(DIALECT_TO_SPEC_CODE[dialect])
42+
43+
# ---------------------------------------------------------------------------
44+
# Base phonemizer — implements the shared G2PPlugin interface
45+
# ---------------------------------------------------------------------------
46+
47+
class MirandesePhonemizer(G2PPlugin):
48+
"""Mirandese G2P base class.
49+
50+
Implements the shared :class:`orthography2ipa.g2p_plugin.G2PPlugin`
51+
interface so all eight concrete backends gain conformance for free.
52+
The existing public API (``phonemize`` / ``phonemize_sentence``) is
53+
preserved unchanged; ``transcribe`` and ``transcribe_word`` are thin
54+
wrappers that delegate to it.
55+
"""
56+
1657
def __init__(self,
1758
gold_dict: str | None = None,
1859
raiano_dict: str | None = None, # dialect exceptions
@@ -32,6 +73,37 @@ def __init__(self,
3273
with open(sendinese_dict, "r", encoding="utf-8") as f:
3374
self.SENDINESE_GOLD = {k: self.strip_markers(v) for k, v in json.load(f).items()}
3475

76+
# ------------------------------------------------------------------
77+
# G2PPlugin interface
78+
# ------------------------------------------------------------------
79+
80+
@property
def language_codes(self) -> List[str]:
    """List the BCP-47 codes this instance handles.

    ``"mwl"`` (base Mirandese) is always the first entry. If
    ``self.dialect`` maps, via ``DIALECT_TO_SPEC_CODE``, to a code other
    than ``"mwl"`` (at present only SENDINESE -> ``"mwl-x-sendim"``),
    that code follows it.
    """
    dialect_code = DIALECT_TO_SPEC_CODE.get(self.dialect)
    # Unknown dialect, or one that shares the base code: nothing to add.
    if not dialect_code or dialect_code == "mwl":
        return ["mwl"]
    return ["mwl", dialect_code]
94+
95+
def transcribe(self, text: str) -> str:
    """Return the IPA transcription of the sentence *text*.

    This is a thin wrapper: the work is done by ``phonemize_sentence``
    and its result is returned unchanged.
    """
    ipa = self.phonemize_sentence(text)
    return ipa
98+
99+
def transcribe_word(self, word: str, context: Optional[WordContext] = None) -> str:
    """Return the IPA transcription of the single word *word*.

    This is a thin wrapper around ``phonemize``. *context* is accepted as
    part of the signature but is not consulted here; only *word* is
    forwarded.
    """
    ipa = self.phonemize(word)
    return ipa
102+
103+
# ------------------------------------------------------------------
104+
# Original public API (unchanged)
105+
# ------------------------------------------------------------------
106+
35107
def phonemize(self, word: str, lookup_word: bool = True) -> str:
36108
if lookup_word and word.lower() in self.GOLD:
37109
return self.GOLD[word.lower()]
@@ -121,4 +193,3 @@ def evaluate_on_gold(self, limit=None, detailed=False, show_changes=False):
121193
"details": details
122194
}
123195
return result
124-

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ dependencies = [
1515
"sklearn_crfsuite",
1616
"editdistance",
1717
"joblib",
18+
"orthography2ipa>=0.3.0a1",
1819
]
1920
keywords = ["mirandese", "mwl", "phonemizer", "g2p", "ipa", "tts"]
2021
classifiers = [
tests/test_orthography2ipa_interface.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
"""Tests for the orthography2ipa interface conformance and differential audit.
2+
3+
Validates:
4+
- MirandesePhonemizer (and all backends) implement G2PPlugin
5+
- transcribe() == phonemize_sentence(), transcribe_word() == phonemize()
6+
- language_codes per dialect
7+
- differential audit: g2p.json grapheme candidates vs orthography2ipa mwl spec
8+
(informational gate — agree >= differ; known divergences: j ʒ-vs-ʝ, ç s̻-vs-t͡s)
9+
"""
10+
import json
11+
import os
12+
13+
import pytest
14+
15+
import orthography2ipa
16+
from orthography2ipa.g2p_plugin import G2PPlugin, WordContext
17+
18+
from mwl_phonemizer.base import Dialects, MirandesePhonemizer, spec_for, DIALECT_TO_SPEC_CODE
19+
from mwl_phonemizer.orthography_hand_rules import OrthographyRulesMWL
20+
from mwl_phonemizer.char_lookup_mwl import LookupTableMWL
21+
from mwl_phonemizer.ngram_mwl import NgramMWLPhonemizer
22+
from mwl_phonemizer.crf_mwl import CRFPhonemizer
23+
from mwl_phonemizer.crf_ortho_mwl import CRFOrthoCorrector
24+
25+
26+
# ---------------------------------------------------------------------------
27+
# Interface conformance
28+
# ---------------------------------------------------------------------------
29+
30+
class TestInterface:
31+
"""All concrete backends must implement the shared G2PPlugin base."""
32+
33+
@pytest.fixture(params=[
34+
"rules",
35+
"lookup",
36+
"ngram",
37+
"crf",
38+
"crf_ortho",
39+
])
40+
def plugin(self, request):
41+
mapping = {
42+
"rules": OrthographyRulesMWL,
43+
"lookup": LookupTableMWL,
44+
"ngram": NgramMWLPhonemizer,
45+
"crf": CRFPhonemizer,
46+
"crf_ortho": CRFOrthoCorrector,
47+
}
48+
return mapping[request.param]()
49+
50+
def test_implements_g2p_plugin(self, plugin):
51+
assert isinstance(plugin, G2PPlugin)
52+
53+
def test_language_codes_contains_mwl(self, plugin):
54+
assert "mwl" in plugin.language_codes
55+
56+
def test_transcribe_equals_phonemize_sentence(self, plugin):
57+
text = "mui bien"
58+
assert plugin.transcribe(text) == plugin.phonemize_sentence(text)
59+
60+
def test_transcribe_word_equals_phonemize(self, plugin):
61+
word = "bien"
62+
assert plugin.transcribe_word(word) == plugin.phonemize(word)
63+
64+
def test_transcribe_word_accepts_context(self, plugin):
65+
ctx = WordContext(prev_word="mui", next_word=None)
66+
result = plugin.transcribe_word("bien", context=ctx)
67+
assert isinstance(result, str)
68+
69+
70+
class TestLanguageCodes:
71+
def test_central_dialect_codes(self):
72+
p = OrthographyRulesMWL(dialect=Dialects.CENTRAL)
73+
assert p.language_codes == ["mwl"]
74+
75+
def test_raiano_dialect_codes(self):
76+
p = OrthographyRulesMWL(dialect=Dialects.RAIANO)
77+
# RAIANO has no dedicated spec yet; maps to base mwl
78+
assert p.language_codes == ["mwl"]
79+
80+
def test_sendinese_dialect_codes(self):
81+
p = OrthographyRulesMWL(dialect=Dialects.SENDINESE)
82+
assert "mwl" in p.language_codes
83+
assert "mwl-x-sendim" in p.language_codes
84+
85+
86+
# ---------------------------------------------------------------------------
87+
# Dialect ↔ spec helper
88+
# ---------------------------------------------------------------------------
89+
90+
class TestSpecFor:
91+
def test_central_returns_mwl_spec(self):
92+
spec = spec_for(Dialects.CENTRAL)
93+
assert spec.code == "mwl"
94+
95+
def test_sendinese_returns_sendim_spec(self):
96+
spec = spec_for(Dialects.SENDINESE)
97+
assert spec.code == "mwl-x-sendim"
98+
99+
def test_raiano_returns_mwl_spec(self):
100+
spec = spec_for(Dialects.RAIANO)
101+
assert spec.code == "mwl"
102+
103+
def test_dialect_to_spec_code_complete(self):
104+
for d in Dialects:
105+
assert d in DIALECT_TO_SPEC_CODE, f"Dialect {d} missing from DIALECT_TO_SPEC_CODE"
106+
107+
108+
# ---------------------------------------------------------------------------
109+
# Differential audit
110+
# ---------------------------------------------------------------------------
111+
112+
class TestDifferentialGraphemes:
113+
"""Audit g2p.json grapheme candidates against the orthography2ipa mwl spec.
114+
115+
Informational gate: the local g2p.json table stays authoritative;
116+
this test pins the agreement level so drift on either side is visible.
117+
118+
Known divergences being adjudicated upstream (Phase M0):
119+
- j: g2p.json has ʒ, spec has ʝ
120+
- ç: g2p.json has z̻, spec has t͡s (Mirandese affricate vs. fricative)
121+
These are expected and do NOT cause a test failure as long as agree >= differ.
122+
"""
123+
124+
def test_graphemes_agree_with_spec(self, capsys):
125+
g2p_path = os.path.join(os.path.dirname(__file__), "..", "mwl_phonemizer", "g2p.json")
126+
with open(g2p_path, encoding="utf-8") as f:
127+
g2p = json.load(f)
128+
129+
spec = orthography2ipa.get("mwl")
130+
agree, differ = [], []
131+
132+
for grapheme, candidates in g2p.items():
133+
spec_candidates = spec.graphemes.get(grapheme)
134+
if spec_candidates is None:
135+
continue # multi-char or word-specific entry not in spec — skip
136+
# agree if at least one candidate appears in the spec set
137+
shared = [c for c in candidates if c in spec_candidates]
138+
if shared:
139+
agree.append((grapheme, candidates, spec_candidates))
140+
else:
141+
differ.append((grapheme, candidates, spec_candidates))
142+
143+
with capsys.disabled():
144+
print(f"\n[differential audit] agree={len(agree)} differ={len(differ)}")
145+
if differ:
146+
print("Divergences (g2p.json vs spec):")
147+
for g, local, remote in differ:
148+
print(f" {g!r:12s} local={local} spec={remote}")
149+
150+
assert len(agree) >= len(differ), (
151+
f"g2p.json and the mwl spec diverged too much: "
152+
f"agree={len(agree)}, differ={len(differ)}\n"
153+
f"Divergences: {[(g, l, r) for g, l, r in differ]}"
154+
)

0 commit comments

Comments
 (0)