@@ -81,7 +81,7 @@ def strip_accents(text):
8181 return regex .sub (ACCENTS_PATTERN , "" , text )
8282
8383
84- def to_US_accents (text , scheme = None , UDATTA = "᳓" , SVARITA_NEW = "᳙" , pauses = r"[।॥\n,;]+" , skip_pattern = r"\+\+\+\(.+?\)\+\+\+" ):
84+ def to_US_accents (text , scheme = None , UDATTA = "᳓" , SVARITA_NEW = "᳙" , pauses = r"[।॥\n,;]+" , skip_pattern = r"\+\+\+\(( .+?) \)\+\+\+" ):
8585 """Given text like
8686 ध्रु॒वो॑ऽसि ।
8787 ध्रु॒वो॒॑ऽहँ स॑जा॒तेषु॑ भूयास॒न्
@@ -97,6 +97,8 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
9797 if not any (x in text for x in [SVARITA , SANNATARA , "᳚" , "᳛" ]):
9898 # Avoid inserting udattas from the beginning on an invalid (already converted) input
9999 return text
100+ if any (x in text for x in [SVARITA_NEW , UDATTA ]):
101+ return text
100102 text = regex .sub ("[᳚᳛]" , SVARITA , text )
101103 if scheme == None :
102104 from indic_transliteration import sanscript
@@ -106,7 +108,8 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
106108 SKIP_PATTERN = regex .compile (skip_pattern )
107109
108110 # Split the text into a list of syllables and other elements.
109- letters = scheme .split_vyanjanas_and_svaras (text , skip_pattern = skip_pattern )
111+ skip_pattern_noncapture = regex .sub (r"(?<=^|[^\\])\(" , "(?:" , skip_pattern )
112+ letters = scheme .split_vyanjanas_and_svaras (text , skip_pattern = skip_pattern_noncapture )
110113 # Example output here - ['स्', "ओ", "+++(=tick)+++", 'ऽ', 'ग्', "न्", "इ॒", "म्", "ए॑", "व्", "अ"]
111114
112115 out_letters = list (letters )
@@ -135,6 +138,10 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
135138 # If a syllable has a svarita and the predecessor has a sannatara, remove both accents and add a svarita_new to the current syllable.
136139 # This rule (e.g., ध्रु॒वो॑ -> ध्रुवो᳕) is a specific substitution that takes precedence.
137140 for index , letter in enumerate (out_letters ):
141+ # Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
142+ if SKIP_PATTERN .fullmatch (letter ):
143+ continue
144+
138145 # If a syllable has a svarita...
139146 if SVARITA in letter and not SANNATARA in letter :
140147 # ...and the predecessor has a sannatara...
@@ -145,6 +152,10 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
145152
146153 for index , letter in enumerate (out_letters ):
147154 is_kampa = SVARITA in letter and SANNATARA in letter # Rule 1
155+
156+ # Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
157+ if SKIP_PATTERN .fullmatch (letter ):
158+ continue
148159
149160 # If a syllable has both sannatara and svarita signs (like वो॒॑), replace its svarita with udAtta, and temporarily keep the sannatara in itself.
150161 if is_kampa :
@@ -156,6 +167,10 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
156167
157168 # If a syllable has svarita, mark all preceding syllables until a sannatara or svarita_new accent or a pause is reached with udAtta; at which point remove any preceding sannatara.
158169 for index , letter in enumerate (out_letters ):
170+ # Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
171+ if SKIP_PATTERN .fullmatch (letter ):
172+ continue
173+
159174 # --- Backward "painting" from a Svarita ---
160175 if not SVARITA in letter :
161176 continue
@@ -179,6 +194,9 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
179194
180195 # If a syllable has sannatara, mark all succeeding syllables with udAtta until a svarita is reached or a pause is reached. Remove the triggering sannatara. After this is done for all syllables, there should be no sannatara left.
181196 for index , letter in enumerate (out_letters ):
197+ # Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
198+ if SKIP_PATTERN .fullmatch (letter ):
199+ continue
182200
183201 # --- Forward "painting" from a Sannatara ---
184202 # This applies to simple sannatara only. Kampa's effect is handled above.
@@ -201,6 +219,13 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
201219 curr_fwd_index = scheme .get_adjacent_syllable_index (curr_fwd_index , out_letters , + 1 ,
202220 pauses_pattern = PAUSES_PATTERN )
203221
222+ for index , letter in enumerate (out_letters ):
223+ match = SKIP_PATTERN .match (letter )
224+ if match :
225+ replacement = to_US_accents (text = match .group (1 ), scheme = scheme , UDATTA = UDATTA , SVARITA_NEW = SVARITA_NEW , pauses = pauses , skip_pattern = skip_pattern )
226+ out_letters [index ] = letter .replace (match .group (1 ), replacement )
227+
228+
204229 text = scheme .join_strings (out_letters )
205230 text = text .replace (SVARITA , "" ).replace (SANNATARA , "" )
206231 return text
0 commit comments