1313# limitations under the License.
1414
1515import pathlib
16- import re
1716import unicodedata
1817from collections import defaultdict
1918from typing import Dict , List , Optional , Union
2019
2120from nemo .collections .common .tokenizers .text_to_speech .ipa_lexicon import (
22- GRAPHEME_CHARACTER_SETS ,
2321 get_grapheme_character_set ,
2422 get_ipa_punctuation_list ,
2523)
@@ -44,13 +42,12 @@ class KannadaG2p(BaseG2p):
4442 ['k', 'a', 'n', 'n', 'a', 'ɖ', 'a']
4543 """
4644
47-
4845 def __init__ (
4946 self ,
5047 phoneme_dict : Optional [Union [str , pathlib .Path , Dict [str , List [str ]]]] = None ,
5148 phoneme_prefix : str = "" ,
52- ascii_letter_prefix : str = "" ,
53- ascii_letter_case : str = "lower" ,
49+ grapheme_prefix : str = "" ,
50+ grapheme_case : str = "lower" ,
5451 word_tokenize_func = None ,
5552 apply_to_oov_word = None ,
5653 mapping_file : Optional [str ] = None ,
@@ -59,12 +56,12 @@ def __init__(
5956
6057 Args:
6158 phoneme_dict: Path to Kannada pronunciation dictionary file or a dict object.
62- Format: word<whitespace>phonemes (space-separated IPA symbols )
59+ Format: word<TAB>pronunciation ( IPA characters without spaces )
6360 phoneme_prefix: Prefix to prepend to phoneme symbols to distinguish from graphemes.
6461 Default is "" (no prefix).
65- ascii_letter_prefix : Prefix to prepend to ASCII letters for code-mixed text.
66- Default is "" (no prefix).
67- ascii_letter_case : Case for ASCII letters : "upper", "lower", or "mixed".
62+ grapheme_prefix : Prefix to prepend to graphemes ( ASCII letters in code-mixed text)
63+ to distinguish them from phonemes. Default is "" (no prefix).
64+ grapheme_case : Case for graphemes : "upper", "lower", or "mixed".
6865 Default is "lower".
6966 word_tokenize_func: Custom function for tokenizing text into words.
7067 Should return List[Tuple[Union[str, List[str]], bool]].
@@ -74,30 +71,31 @@ def __init__(
7471 """
7572 if phoneme_prefix is None :
7673 phoneme_prefix = ""
77- if ascii_letter_prefix is None :
78- ascii_letter_prefix = ""
74+ if grapheme_prefix is None :
75+ grapheme_prefix = ""
7976
8077 self .phoneme_prefix = phoneme_prefix
78+ self .grapheme_prefix = grapheme_prefix
79+ self .grapheme_case = grapheme_case
8180
8281 # Load phoneme dictionary if provided
8382 if phoneme_dict is not None :
84- phoneme_dict = (
85- self ._parse_phoneme_dict (phoneme_dict , phoneme_prefix )
86- if isinstance ( phoneme_dict , ( str , pathlib . Path ))
87- else phoneme_dict
88- )
83+ if isinstance ( phoneme_dict , ( str , pathlib . Path )):
84+ phoneme_dict = self ._parse_phoneme_dict (phoneme_dict , phoneme_prefix )
85+ else :
86+ # Normalize dict input: split string pronunciations into character lists
87+ phoneme_dict = self . _normalize_phoneme_dict ( phoneme_dict , phoneme_prefix )
8988 self .phoneme_list = sorted ({pron for prons in phoneme_dict .values () for pron in prons })
9089 else :
9190 phoneme_dict = {}
9291 self .phoneme_list = []
9392
94- # ASCII letter handling for code-mixed text (Kannada + English)
95- self .ascii_letter_dict = {
96- x : ascii_letter_prefix + x
97- for x in get_grapheme_character_set (locale = "en-US" , case = ascii_letter_case )
93+ # Grapheme handling for code-mixed text (Kannada + English)
94+ self .grapheme_dict = {
95+ x : grapheme_prefix + x
96+ for x in get_grapheme_character_set (locale = "en-US" , case = grapheme_case )
9897 }
99- self .ascii_letter_list = sorted (self .ascii_letter_dict )
100- self .ascii_letter_case = ascii_letter_case
98+ self .grapheme_list = sorted (self .grapheme_dict )
10199
102100 # Punctuation set
103101 self .punctuation = get_ipa_punctuation_list ('kn-IN' )
@@ -120,6 +118,123 @@ def __init__(
120118 mapping_file = mapping_file ,
121119 )
122120
121+ # Build symbols set for IPATokenizer compatibility
122+ self ._build_symbols ()
123+
def _build_symbols(self):
    """Populate ``self.symbols`` with every grapheme and phoneme token known to this G2P.

    The resulting set is the union of:
      * the Kannada grapheme set,
      * the ``kn-IN`` IPA character set, each character carrying ``self.phoneme_prefix``,
      * the phonemes collected from the pronunciation dictionary,
      * the (possibly prefixed) graphemes used for code-mixed text,
      * the punctuation list,
      * the ASCII digits ``0``-``9``.

    IPATokenizer reads ``g2p.symbols``, so this attribute is needed for compatibility with it.
    """
    # Function-scope import: the IPA table is only needed while building the set.
    from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import IPA_CHARACTER_SETS

    phoneme_prefix = self.phoneme_prefix

    # Characters of the kn-IN IPA inventory, prefixed the same way emitted phonemes are.
    prefixed_ipa = {phoneme_prefix + ipa_char for ipa_char in IPA_CHARACTER_SETS.get("kn-IN", ())}

    self.symbols = set().union(
        self.kannada_grapheme_set,
        prefixed_ipa,
        # Dictionary phonemes are stored in phoneme_list already carrying the prefix.
        self.phoneme_list,
        # Use the dict values (not keys) so the grapheme prefix is included.
        self.grapheme_dict.values(),
        self.punctuation,
        # The rule-based G2P emits ASCII digits for both Kannada and ASCII digit input.
        '0123456789',
    )
156+
157+ @staticmethod
158+ def _normalize_phoneme_dict (
159+ phoneme_dict : Dict [str , List [str ]],
160+ phoneme_prefix : str
161+ ) -> Dict [str , List [str ]]:
162+ """Normalize a dict-provided phoneme dictionary.
163+
164+ Supports two input formats:
165+ 1. {"word": ["phonemestring"]} - string gets split into characters
166+ 2. {"word": ["p", "h", "o", ...]} - already flat, used as-is
167+
168+ Args:
169+ phoneme_dict: Dictionary mapping words to pronunciations.
170+ phoneme_prefix: Prefix to add to each phoneme character.
171+
172+ Returns:
173+ Normalized dictionary with pronunciations as flat character lists.
174+ """
175+ normalized = {}
176+ for word , prons in phoneme_dict .items ():
177+ if not isinstance (prons , list ) or len (prons ) == 0 :
178+ normalized [word ] = prons
179+ continue
180+
181+ # Detect format: flat list of tokens vs list containing pronunciation string(s)
182+ # Flat format: ["k", "a", "n", ...] - all single chars
183+ # String format: ["kannaɖa"] - one or more multi-char strings
184+ is_flat_token_list = all (
185+ isinstance (p , str ) and len (p ) == 1 for p in prons
186+ )
187+
188+ if is_flat_token_list :
189+ # Already flat list of single-char tokens, just apply prefix
190+ if phoneme_prefix :
191+ normalized [word ] = [phoneme_prefix + p for p in prons ]
192+ else :
193+ normalized [word ] = prons
194+ else :
195+ # List contains pronunciation string(s), take first and split
196+ pron = prons [0 ]
197+ if isinstance (pron , str ):
198+ normalized [word ] = [phoneme_prefix + char for char in pron ]
199+ else :
200+ normalized [word ] = prons
201+
202+ return normalized
203+
def replace_symbols(self, symbols, keep_alternate=True):
    """Install a new symbol vocabulary and drop dictionary entries that do not fit it.

    An entry of ``self.phoneme_dict`` is removed when any character of the word, or any
    token of its pronunciation, is missing from ``symbols``. Pronunciation tokens are
    compared whole (e.g. a prefixed token such as ``"#k"``), never character by character.
    Entries are deleted from the existing dictionary object in place.

    This method is required for compatibility with IPATokenizer's fixed_vocab feature.

    Args:
        symbols: User-provided collection of valid symbols (graphemes and phonemes).
        keep_alternate: Unused, kept for API compatibility with IpaG2p.
    """
    allowed = set(symbols)

    def _has_unknown(items):
        # True when at least one item falls outside the allowed vocabulary.
        return bool(set(items) - allowed)

    # Collect first, delete afterwards: the dict must not change size while iterated.
    # The word is checked before the pronunciation; the latter is skipped on a word failure.
    rejected = [
        word
        for word, pronunciation in self.phoneme_dict.items()
        if _has_unknown(word) or _has_unknown(pronunciation)
    ]
    for word in rejected:
        del self.phoneme_dict[word]

    self.symbols = allowed
237+
123238 def _init_kannada_rules (self ):
124239 """Initialize Kannada grapheme-to-phoneme mapping rules based on Kannada phonology."""
125240
@@ -224,13 +339,13 @@ def _init_kannada_rules(self):
224339
225340 def _split_phoneme (self , phoneme : str , prefix : str ) -> List [str ]:
226341 """Split multi-character phonemes into separate tokens for consistency.
227-
342+
228343 Splits multi-character phonemes into individual characters for consistent tokenization.
229-
344+
230345 Args:
231346 phoneme: The phoneme string to potentially split.
232347 prefix: Prefix to add to each token.
233-
348+
234349 Returns:
235350 List of prefixed phoneme tokens.
236351 """
@@ -409,29 +524,29 @@ def _rule_based_g2p(self, text: str) -> List[str]:
409524 continue
410525
411526 # Handle ASCII letters (code-mixed text)
412- if char .upper () in self .ascii_letter_dict or char .lower () in self .ascii_letter_dict :
413- processed_char = set_grapheme_case (char , case = self .ascii_letter_case )
414- if processed_char in self .ascii_letter_dict :
415- phonemes .append (self .ascii_letter_dict [processed_char ])
527+ if char .upper () in self .grapheme_dict or char .lower () in self .grapheme_dict :
528+ processed_char = set_grapheme_case (char , case = self .grapheme_case )
529+ if processed_char in self .grapheme_dict :
530+ phonemes .append (self .grapheme_dict [processed_char ])
416531 else :
417532 phonemes .append (processed_char )
418533 i += 1
419534 continue
420535
421- # Handle digits (pass through or convert)
422- if char .isdigit ():
423- phonemes .append (char )
424- i += 1
425- continue
426-
427- # Handle Kannada digits
536+ # Handle Kannada digits (must check before isdigit() as isdigit() is True for Kannada digits)
428537 kannada_digits = '೦೧೨೩೪೫೬೭೮೯'
429538 if char in kannada_digits :
430539 # Convert to Arabic numeral
431540 phonemes .append (str (kannada_digits .index (char )))
432541 i += 1
433542 continue
434543
544+ # Handle ASCII digits (pass through)
545+ if char .isascii () and char .isdigit ():
546+ phonemes .append (char )
547+ i += 1
548+ continue
549+
435550 # Handle punctuation
436551 if char in self .punctuation :
437552 phonemes .append (char )
@@ -506,13 +621,35 @@ def __call__(self, text: str) -> List[str]:
506621 text = unicodedata .normalize ('NFC' , text )
507622
508623 # Apply case transformation for ASCII letters
509- text = set_grapheme_case (text , case = self .ascii_letter_case )
624+ text = set_grapheme_case (text , case = self .grapheme_case )
510625
511626 # Tokenize into words
512- tokens = self ._tokenize (text )
627+ if self .word_tokenize_func is not None :
628+ # Custom tokenizer returns List[Tuple[Union[str, List[str]], bool]]
629+ # where bool (without_changes) indicates whether to pass through unchanged
630+ words_and_flags = self .word_tokenize_func (text )
631+ else :
632+ # Default tokenizer returns List[str], convert to expected format
633+ # without_changes=False means process the word
634+ words_and_flags = [([token ], False ) for token in self ._tokenize (text )]
513635
514636 phoneme_seq = []
515- for token in tokens :
637+ for words , without_changes in words_and_flags :
638+ if without_changes :
639+ # Pass through unchanged: prefix ASCII letters (case-normalized), leave others as-is
640+ for word in words :
641+ for char in word :
642+ normalized_char = set_grapheme_case (char , case = self .grapheme_case )
643+ if normalized_char in self .grapheme_dict :
644+ phoneme_seq .append (self .grapheme_dict [normalized_char ])
645+ else :
646+ phoneme_seq .append (char )
647+ continue
648+
649+ # Process the word(s)
650+ assert len (words ) == 1 , f"{ words } should have single item when without_changes is False"
651+ token = words [0 ]
652+
516653 # Skip whitespace tokens
517654 if token .isspace ():
518655 phoneme_seq .append (' ' )
0 commit comments