Skip to content

Commit d67c70b

Browse files
author
Jason Roche
committed
fix review comments
Signed-off-by: Jason Roche <jas.tech23@gmail.com>
1 parent aec0b40 commit d67c70b

2 files changed

Lines changed: 294 additions & 39 deletions

File tree

nemo/collections/tts/g2p/models/kn_in_ipa.py

Lines changed: 176 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,11 @@
1313
# limitations under the License.
1414

1515
import pathlib
16-
import re
1716
import unicodedata
1817
from collections import defaultdict
1918
from typing import Dict, List, Optional, Union
2019

2120
from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
22-
GRAPHEME_CHARACTER_SETS,
2321
get_grapheme_character_set,
2422
get_ipa_punctuation_list,
2523
)
@@ -44,13 +42,12 @@ class KannadaG2p(BaseG2p):
4442
['k', 'a', 'n', 'n', 'a', 'ɖ', 'a']
4543
"""
4644

47-
4845
def __init__(
4946
self,
5047
phoneme_dict: Optional[Union[str, pathlib.Path, Dict[str, List[str]]]] = None,
5148
phoneme_prefix: str = "",
52-
ascii_letter_prefix: str = "",
53-
ascii_letter_case: str = "lower",
49+
grapheme_prefix: str = "",
50+
grapheme_case: str = "lower",
5451
word_tokenize_func=None,
5552
apply_to_oov_word=None,
5653
mapping_file: Optional[str] = None,
@@ -59,12 +56,12 @@ def __init__(
5956
6057
Args:
6158
phoneme_dict: Path to Kannada pronunciation dictionary file or a dict object.
62-
Format: word<whitespace>phonemes (space-separated IPA symbols)
59+
Format: word<TAB>pronunciation (IPA characters without spaces)
6360
phoneme_prefix: Prefix to prepend to phoneme symbols to distinguish from graphemes.
6461
Default is "" (no prefix).
65-
ascii_letter_prefix: Prefix to prepend to ASCII letters for code-mixed text.
66-
Default is "" (no prefix).
67-
ascii_letter_case: Case for ASCII letters: "upper", "lower", or "mixed".
62+
grapheme_prefix: Prefix to prepend to graphemes (ASCII letters in code-mixed text)
63+
to distinguish them from phonemes. Default is "" (no prefix).
64+
grapheme_case: Case for graphemes: "upper", "lower", or "mixed".
6865
Default is "lower".
6966
word_tokenize_func: Custom function for tokenizing text into words.
7067
Should return List[Tuple[Union[str, List[str]], bool]].
@@ -74,30 +71,31 @@ def __init__(
7471
"""
7572
if phoneme_prefix is None:
7673
phoneme_prefix = ""
77-
if ascii_letter_prefix is None:
78-
ascii_letter_prefix = ""
74+
if grapheme_prefix is None:
75+
grapheme_prefix = ""
7976

8077
self.phoneme_prefix = phoneme_prefix
78+
self.grapheme_prefix = grapheme_prefix
79+
self.grapheme_case = grapheme_case
8180

8281
# Load phoneme dictionary if provided
8382
if phoneme_dict is not None:
84-
phoneme_dict = (
85-
self._parse_phoneme_dict(phoneme_dict, phoneme_prefix)
86-
if isinstance(phoneme_dict, (str, pathlib.Path))
87-
else phoneme_dict
88-
)
83+
if isinstance(phoneme_dict, (str, pathlib.Path)):
84+
phoneme_dict = self._parse_phoneme_dict(phoneme_dict, phoneme_prefix)
85+
else:
86+
# Normalize dict input: split string pronunciations into character lists
87+
phoneme_dict = self._normalize_phoneme_dict(phoneme_dict, phoneme_prefix)
8988
self.phoneme_list = sorted({pron for prons in phoneme_dict.values() for pron in prons})
9089
else:
9190
phoneme_dict = {}
9291
self.phoneme_list = []
9392

94-
# ASCII letter handling for code-mixed text (Kannada + English)
95-
self.ascii_letter_dict = {
96-
x: ascii_letter_prefix + x
97-
for x in get_grapheme_character_set(locale="en-US", case=ascii_letter_case)
93+
# Grapheme handling for code-mixed text (Kannada + English)
94+
self.grapheme_dict = {
95+
x: grapheme_prefix + x
96+
for x in get_grapheme_character_set(locale="en-US", case=grapheme_case)
9897
}
99-
self.ascii_letter_list = sorted(self.ascii_letter_dict)
100-
self.ascii_letter_case = ascii_letter_case
98+
self.grapheme_list = sorted(self.grapheme_dict)
10199

102100
# Punctuation set
103101
self.punctuation = get_ipa_punctuation_list('kn-IN')
@@ -120,6 +118,123 @@ def __init__(
120118
mapping_file=mapping_file,
121119
)
122120

121+
# Build symbols set for IPATokenizer compatibility
122+
self._build_symbols()
123+
124+
def _build_symbols(self):
    """Assemble ``self.symbols``, the full vocabulary this G2P can produce.

    IPATokenizer reads ``g2p.symbols``, so the attribute must exist and must
    cover every grapheme and phoneme token.
    """
    from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import IPA_CHARACTER_SETS

    phoneme_prefix = self.phoneme_prefix

    # Start from the Kannada graphemes.
    vocab = set(self.kannada_grapheme_set)

    # IPA characters for the locale carry the phoneme prefix (if one is set).
    vocab |= {phoneme_prefix + ipa_char for ipa_char in IPA_CHARACTER_SETS.get("kn-IN", ())}

    vocab.update(
        self.phoneme_list,  # dictionary phonemes, prefixed when the dictionary was loaded
        self.grapheme_dict.values(),  # code-mixed graphemes; the values include the grapheme prefix
        self.punctuation,
        '0123456789',  # ASCII digits: Kannada digits are converted to these, ASCII ones pass through
    )

    self.symbols = vocab
156+
157+
@staticmethod
158+
def _normalize_phoneme_dict(
159+
phoneme_dict: Dict[str, List[str]],
160+
phoneme_prefix: str
161+
) -> Dict[str, List[str]]:
162+
"""Normalize a dict-provided phoneme dictionary.
163+
164+
Supports two input formats:
165+
1. {"word": ["phonemestring"]} - string gets split into characters
166+
2. {"word": ["p", "h", "o", ...]} - already flat, used as-is
167+
168+
Args:
169+
phoneme_dict: Dictionary mapping words to pronunciations.
170+
phoneme_prefix: Prefix to add to each phoneme character.
171+
172+
Returns:
173+
Normalized dictionary with pronunciations as flat character lists.
174+
"""
175+
normalized = {}
176+
for word, prons in phoneme_dict.items():
177+
if not isinstance(prons, list) or len(prons) == 0:
178+
normalized[word] = prons
179+
continue
180+
181+
# Detect format: flat list of tokens vs list containing pronunciation string(s)
182+
# Flat format: ["k", "a", "n", ...] - all single chars
183+
# String format: ["kannaɖa"] - one or more multi-char strings
184+
is_flat_token_list = all(
185+
isinstance(p, str) and len(p) == 1 for p in prons
186+
)
187+
188+
if is_flat_token_list:
189+
# Already flat list of single-char tokens, just apply prefix
190+
if phoneme_prefix:
191+
normalized[word] = [phoneme_prefix + p for p in prons]
192+
else:
193+
normalized[word] = prons
194+
else:
195+
# List contains pronunciation string(s), take first and split
196+
pron = prons[0]
197+
if isinstance(pron, str):
198+
normalized[word] = [phoneme_prefix + char for char in pron]
199+
else:
200+
normalized[word] = prons
201+
202+
return normalized
203+
204+
def replace_symbols(self, symbols, keep_alternate=True):
    """Install a new symbol vocabulary and drop dictionary entries that do not fit it.

    This method is required for compatibility with IPATokenizer's fixed_vocab feature.

    Args:
        symbols: User-provided set of valid symbols (graphemes and phonemes).
        keep_alternate: Unused, kept for API compatibility with IpaG2p.
    """
    allowed = set(symbols)

    # An entry survives only if every character of the word is allowed and
    # every pronunciation token is allowed. Tokens are compared whole (a
    # prefixed token such as "#k" is one symbol), never character by character.
    # The rejects are collected first because the dict cannot be resized
    # while it is being iterated.
    rejected = [
        word
        for word, prons in self.phoneme_dict.items()
        if not (allowed.issuperset(word) and allowed.issuperset(prons))
    ]

    # Delete in place so that existing references to the dict see the change.
    for word in rejected:
        del self.phoneme_dict[word]

    self.symbols = allowed
237+
123238
def _init_kannada_rules(self):
124239
"""Initialize Kannada grapheme-to-phoneme mapping rules based on Kannada phonology."""
125240

@@ -224,13 +339,13 @@ def _init_kannada_rules(self):
224339

225340
def _split_phoneme(self, phoneme: str, prefix: str) -> List[str]:
226341
"""Split multi-character phonemes into separate tokens for consistency.
227-
342+
228343
Splits multi-character phonemes into individual characters for consistent tokenization.
229-
344+
230345
Args:
231346
phoneme: The phoneme string to potentially split.
232347
prefix: Prefix to add to each token.
233-
348+
234349
Returns:
235350
List of prefixed phoneme tokens.
236351
"""
@@ -409,29 +524,29 @@ def _rule_based_g2p(self, text: str) -> List[str]:
409524
continue
410525

411526
# Handle ASCII letters (code-mixed text)
412-
if char.upper() in self.ascii_letter_dict or char.lower() in self.ascii_letter_dict:
413-
processed_char = set_grapheme_case(char, case=self.ascii_letter_case)
414-
if processed_char in self.ascii_letter_dict:
415-
phonemes.append(self.ascii_letter_dict[processed_char])
527+
if char.upper() in self.grapheme_dict or char.lower() in self.grapheme_dict:
528+
processed_char = set_grapheme_case(char, case=self.grapheme_case)
529+
if processed_char in self.grapheme_dict:
530+
phonemes.append(self.grapheme_dict[processed_char])
416531
else:
417532
phonemes.append(processed_char)
418533
i += 1
419534
continue
420535

421-
# Handle digits (pass through or convert)
422-
if char.isdigit():
423-
phonemes.append(char)
424-
i += 1
425-
continue
426-
427-
# Handle Kannada digits
536+
# Handle Kannada digits (must check before isdigit() as isdigit() is True for Kannada digits)
428537
kannada_digits = '೦೧೨೩೪೫೬೭೮೯'
429538
if char in kannada_digits:
430539
# Convert to Arabic numeral
431540
phonemes.append(str(kannada_digits.index(char)))
432541
i += 1
433542
continue
434543

544+
# Handle ASCII digits (pass through)
545+
if char.isascii() and char.isdigit():
546+
phonemes.append(char)
547+
i += 1
548+
continue
549+
435550
# Handle punctuation
436551
if char in self.punctuation:
437552
phonemes.append(char)
@@ -506,13 +621,35 @@ def __call__(self, text: str) -> List[str]:
506621
text = unicodedata.normalize('NFC', text)
507622

508623
# Apply case transformation for ASCII letters
509-
text = set_grapheme_case(text, case=self.ascii_letter_case)
624+
text = set_grapheme_case(text, case=self.grapheme_case)
510625

511626
# Tokenize into words
512-
tokens = self._tokenize(text)
627+
if self.word_tokenize_func is not None:
628+
# Custom tokenizer returns List[Tuple[Union[str, List[str]], bool]]
629+
# where bool (without_changes) indicates whether to pass through unchanged
630+
words_and_flags = self.word_tokenize_func(text)
631+
else:
632+
# Default tokenizer returns List[str], convert to expected format
633+
# without_changes=False means process the word
634+
words_and_flags = [([token], False) for token in self._tokenize(text)]
513635

514636
phoneme_seq = []
515-
for token in tokens:
637+
for words, without_changes in words_and_flags:
638+
if without_changes:
639+
# Pass through unchanged: prefix ASCII letters (case-normalized), leave others as-is
640+
for word in words:
641+
for char in word:
642+
normalized_char = set_grapheme_case(char, case=self.grapheme_case)
643+
if normalized_char in self.grapheme_dict:
644+
phoneme_seq.append(self.grapheme_dict[normalized_char])
645+
else:
646+
phoneme_seq.append(char)
647+
continue
648+
649+
# Process the word(s)
650+
assert len(words) == 1, f"{words} should have single item when without_changes is False"
651+
token = words[0]
652+
516653
# Skip whitespace tokens
517654
if token.isspace():
518655
phoneme_seq.append(' ')

0 commit comments

Comments
 (0)