TigreGotico
diff --git a/‎CHANGELOG.md‎
Lines changed: 13 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 11 additions & 12 deletions b/‎README.md‎
Lines changed: 11 additions & 12 deletions
diff --git a/‎mwl_phonemizer/__init__.py‎
Lines changed: 5 additions & 1 deletion b/‎mwl_phonemizer/__init__.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎mwl_phonemizer/char_lookup_mwl.py‎
Lines changed: 158 additions & 0 deletions b/‎mwl_phonemizer/char_lookup_mwl.py‎
Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Changelog
+
+## [0.0.3a1](https://github.com/TigreGotico/mwl_phonemizer/tree/0.0.3a1) (2025-10-02)
+
+[Full Changelog](https://github.com/TigreGotico/mwl_phonemizer/compare/0.0.2...0.0.3a1)
+
+**Merged pull requests:**
+
+- Espeak + CRF [\#1](https://github.com/TigreGotico/mwl_phonemizer/pull/1) ([JarbasAl](https://github.com/JarbasAl))
+
+
+
+\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)*
@@ -20,19 +20,15 @@ This repository contains a Python-based Mirandese phonemizer, designed to conver
 ## **Usage**
 
 ```python
-# pick one
-from mwl_phonemizer.crf_mwl import  CRFPhonemizer
-from mwl_phonemizer.epitran_mwl import EpitranMWL
-from mwl_phonemizer.espeak_mwl import EspeakMWL
-from mwl_phonemizer.ngram_mwl import NgramMWLPhonemizer
-from mwl_phonemizer.orthography_hand_rules import OrthographyRulesMWL
+from mwl_phonemizer.crf_espeak_mwl import CRFEspeakCorrector
+
 
 sample_texts = [
     "Muitas lhénguas ténen proua de ls sous pergaminos antigos, de la lhiteratura screbida hai cientos d'anhos i de scritores hai muito afamados, hoije bandeiras dessas lhénguas. Mas outras hai que nun puoden tener proua de nada desso, cumo ye l causo de la lhéngua mirandesa.",
     "Todos ls seres houmanos nácen lhibres i eiguales an honra i an dreitos. Dotados de rezon i de cuncéncia, dében de se dar bien uns culs outros i cumo armano",
 ]
 
-phonemizer = EpitranMWL()
+phonemizer = CRFEspeakCorrector()
 for text in sample_texts:
     print(f"Original: {text}")
     print(f"Phonemized: {phonemizer.phonemize_sentence(text)}\n")
@@ -68,12 +64,15 @@ print(f"Stress-Agnostic IPA: {stress_agnostic_ipa}")
 ## **Phonemizer Comparison**
 
 | Phonemizer            | PER (Full IPA, Stress) | PER (Stress-Agnostic) | Words Incorrect (ED>0) | Notes                                                     |
-| --------------------- |------------------------|-----------------------| ---------------------- | --------------------------------------------------------- |
-| **CRF**               | 20.25%                 | 20.76%                | 117                    | Character-level CRF trained on aligned word–phoneme pairs |
+|-----------------------|------------------------|-----------------------|------------------------|-----------------------------------------------------------|
+| **Espeak + CRF**      | 59.98% → 3.72%         | 39.51% → 4.26%        | 35                     | Espeak output corrected with a CRF model                  |
+| **Epitran + CRF**     | 51.37% → 16.54%        | 44.89% → 18.97%       | 110                    | Epitran output corrected with a CRF model                 |
+| **CRF**               | 15.36%                 | 17.06%                | 117                    | Character-level CRF trained on aligned word–phoneme pairs |
+| **Orthography Rules** | 39.04%                 | 31.99%                | 136                    | Hand-crafted orthographic rules                           |
+| **Epitran + Rules**   | 51.37% → 47.26%        | 44.89% → 40.07%       | 137                    | Epitran output corrected with hand-crafted rules          |
+| **Espeak + Rules**    | 59.98% → 52.35%        | 39.51% → 30.30%       | 73                     | Espeak output corrected with hand-crafted rules           |
 | **N-gram (n=4)**      | 43.93%                 | 30.98%                | 141                    | Statistical N-gram model for G2P conversion               |
-| **Orthography Rules** | 39.04%                 | 31.99%                | 136                    | Handcrafted orthographic rules for all dialects           |
-| **Epitran**           | 51.37% → 47.26%        | 44.89% → 40.07%       | 145                    | Epitran output corrected with Mirandese-specific rules    |
-| **Espeak**            | 59.98% → 52.35%        | 39.51% → 30.30%       | 145                    | Espeak IPA output corrected with rules                    |
+| **Character lookup**  | 43.84%                 | 36.92%                | 142                    | Simple letter/digraph to phoneme lookup table             |
 
 **Notes:**
 
 
@@ -3,6 +3,10 @@
 from mwl_phonemizer.espeak_mwl import EspeakMWL
 from mwl_phonemizer.ngram_mwl import NgramMWLPhonemizer
 from mwl_phonemizer.orthography_hand_rules import OrthographyRulesMWL
+from mwl_phonemizer.crf_espeak_mwl import CRFEspeakCorrector
+from mwl_phonemizer.crf_epitran_mwl import CRFEpitranCorrector
+from mwl_phonemizer.char_lookup_mwl import LookupTableMWL
+
 
 
 if __name__ == "__main__":
@@ -26,7 +30,7 @@
     L furdes ber, talbéç que stéia muôrto!"""
     ]
 
-    phonemizer = EpitranMWL()
+    phonemizer = CRFEspeakCorrector()
     for text in sample_texts:
         print(f"Original: {text}")
         print(f"Phonemized: {phonemizer.phonemize_sentence(text)}\n")
 
@@ -0,0 +1,158 @@
+from mwl_phonemizer.base import MirandesePhonemizer
+
+
+class LookupTableMWL(MirandesePhonemizer):
+    """Baseline Mirandese phonemizer driven by a letter/digraph lookup table.
+
+    ``normalize`` first collapses multi-letter graphemes into single
+    placeholder symbols; ``phonemize`` then replaces every symbol with the
+    first candidate listed for it in ``LETTERS``. Symbols without an entry
+    (spaces, ".", "ʎ", "ɲ", ...) are copied through unchanged.
+    """
+
+    TILDE = "̃"  # ◌̃ (combining tilde)
+
+    # Candidate phonemes per symbol. Only the first candidate of each list is
+    # used by ``phonemize``; the remaining ones are kept as reference.
+    # Lowercase keys are plain letters, uppercase keys are the placeholders
+    # produced by ``normalize`` (input is lowercased, so they cannot clash).
+    LETTERS = {
+        "a": ["a", "ɐ"],
+        "b": ["b", "β"],
+        "c": ["k", "s"],
+        "ç": ["s", "z"],
+        "d": ["d", "ð"],
+        "e": ["ɨ"],
+        "é": ["ɛ"],
+        "f": ["f"],
+        "g": ["ɣ"],
+        "h": [""],  # silent
+        "i": ["i", "j"],
+        "j": ["ʒ"],
+        "l": ["l", "ɫ"],
+        "m": ["m", TILDE],
+        "n": ["n", "ŋ", TILDE],
+        "o": ["u", "o", "ʊ"],
+        "ó": ["ɔ"],
+        "p": ["p"],
+        "q": ["k"],
+        "r": ["ɾ"],
+        "s": ["s̺", "z̺"],
+        "t": ["t"],
+        "u": ["u", "w", "ũ"],
+        "x": ["ʃ"],
+        "y": ["j"],
+        "z": ["z"],
+
+        "A": ["ɐ̃ŋ"],
+        "E": ["ẽŋ", "ɨ̃"],
+        "I": ["ĩŋ"],
+        "O": ["õŋ"],
+        "R": ["r"],
+        "S": ["s̺"],
+        "U": ["ũŋ", "ʊ̃ŋ"],
+        "Q": ["k"],
+        "G": ["g"],
+        "Ç": ["sɛ", "sɨ"],
+        "C": ["s̻i"],
+        "W": ["wo"],
+        "Z": ["sk"],
+        # "I": ["ɨ̃j̃"],  # SENDINESE
+    }
+
+    # Short pauses collapse to a space, sentence-final marks collapse to ".".
+    _PAUSE_TABLE = str.maketrans({
+        "\t": " ",
+        "-": " ",
+        ",": " ",
+        ";": " ",
+        "!": ".",
+        "?": ".",
+    })
+
+    @staticmethod
+    def normalize(sentence: str) -> str:
+        """Lowercase *sentence*, unify pause marks and collapse digraphs.
+
+        Args:
+            sentence: Raw orthographic text (a word or a whole sentence).
+
+        Returns:
+            The text with tab, "-", "," and ";" turned into spaces, "!" and
+            "?" turned into ".", and every known digraph replaced by a
+            single placeholder symbol (or, for "lh", "nh" and "ge", directly
+            by its IPA value).
+        """
+        sentence = sentence.lower().translate(LookupTableMWL._PAUSE_TABLE)
+
+        # Temporary representation of digraphs as individual symbols.
+        # Replacements run in insertion order, and the order matters:
+        #  * "nh" must be handled before the nasal vowels, otherwise a word
+        #    such as "anhos" is consumed as "an" + silent "h" and the
+        #    palatal nasal is never produced;
+        #  * "gu" runs before "Ge", so "gue" -> "Ge" -> "G" (a separate
+        #    "gue" entry could never match and is therefore not listed);
+        #  * "ce" runs before "çc" and "ge" stays last, as before.
+        digraphs = {
+            "lh": "ʎ",
+            "nh": "ɲ",
+            "an": "A",
+            "en": "E",
+            "in": "I",
+            "on": "O",
+            "un": "U",
+            "rr": "R",
+            "ss": "S",
+            "qu": "Q",
+            "gu": "G",
+            "Ge": "G",
+            "ce": "Ç",
+            "ci": "C",
+            "uo": "W",
+            "çc": "Z",
+            "ge": "ʒɨ",
+        }
+        for grapheme, symbol in digraphs.items():
+            sentence = sentence.replace(grapheme, symbol)
+        return sentence
+
+    # -------------------------
+    # Phonemizer interface
+    # -------------------------
+    def phonemize(self, word: str, lookup_word: bool = True) -> str:
+        """Phonemize a single Mirandese word with the lookup table.
+
+        Args:
+            word: Orthographic word to convert.
+            lookup_word: When true and the lowercased word is a key of
+                ``self.GOLD`` (expected to come from the base class), the
+                stored transcription is returned as-is.
+
+        Returns:
+            The concatenation of the first ``LETTERS`` candidate for every
+            normalized symbol; unknown symbols are kept verbatim.
+        """
+        if lookup_word and word.lower() in self.GOLD:
+            return self.GOLD[word.lower()]
+        letters = self.LETTERS
+        return "".join(
+            letters[symbol][0] if symbol in letters else symbol
+            for symbol in self.normalize(word)
+        )
+
+
+if __name__ == "__main__":
+    # Evaluate the lookup-table phonemizer on the gold lexicon, print a
+    # summary report, then phonemize a few sample sentences.
+    phonemizer = LookupTableMWL()
+    stats = phonemizer.evaluate_on_gold(limit=None, detailed=False, show_changes=False)
+
+    # --- Compute PER (Phoneme Error Rate) ---  # TODO - move this to evaluate_on_gold
+    # PER = summed edit distance / summed reference length; the summed edit
+    # distance is recovered from the average times the number of words.
+    ref_len_stress = sum(map(len, phonemizer.GOLD.values()))
+    ref_len_no_stress = sum(
+        len(phonemizer.strip_stress(ipa)) for ipa in phonemizer.GOLD.values()
+    )
+    per = stats['avg_edit_distance'] * stats['counts'] / ref_len_stress
+    per_no_stress = stats['avg_edit_distance_no_stress'] * stats['counts'] / ref_len_no_stress
+
+    # --- Print Summary Metrics ---
+    banner = "=" * 50
+    print("\n" + banner)
+    print("      Mirandese Phonemizer Rule Evaluation")
+    print(banner)
+    print(f"Total Words Evaluated: {stats['counts']}\n")
+
+    print("## Phoneme Error Rate (PER, Full IPA Match, includes stress)")
+    print(f"PER:    {per:.2%}")
+
+    print("\n## Phoneme Error Rate (PER, Stress-Agnostic)")
+    print(f"PER:    {per_no_stress:.2%}")
+
+    # --- Print only 'wrong' words (ED > 0) ---
+    print("\n--- Incorrectly Phonemized Words (Full IPA Match ED > 0) ---")
+    mistakes = stats.get("details", [])
+
+    if not mistakes:
+        print("All words achieved an exact match (100% Accuracy)!")
+    else:
+        print(f"Total Incorrect: {len(mistakes)} words\n")
+
+        # Table header followed by one row per mismatching word.
+        print(f"{'Word':<20} | {'Gold':<15} | {'Phonemized':<15} | {'ED After':<8}")
+        print("-" * 75)
+        for entry in mistakes:
+            row = f"{entry['word']:<20} | {entry['gold']:<15} | {entry['phonemes']:<15} | {entry['ed']:<8}"
+            print(row)
+
+    sample_texts = (
+        "Muitas lhénguas ténen proua de ls sous pergaminos antigos, de la lhiteratura screbida hai cientos d'anhos i de scritores hai muito afamados, hoije bandeiras dessas lhénguas. Mas outras hai que nun puoden tener proua de nada desso, cumo ye l causo de la lhéngua mirandesa.",
+        "Todos ls seres houmanos nácen lhibres i eiguales an honra i an dreitos. Dotados de rezon i de cuncéncia, dében de se dar bien uns culs outros i cumo armano",
+        "Hai más fuogo alhá, i ye deimingo!",
+    )
+    for sample in sample_texts:
+        print(phonemizer.phonemize_sentence(sample))