Skip to content

Commit eda70dc

Browse files
committed
sd
1 parent 94c38d8 commit eda70dc

File tree

2 files changed

+28
-3
lines changed

2 files changed

+28
-3
lines changed

indic_transliteration/sanscript/schemes/brahmic/accent.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def strip_accents(text):
8181
return regex.sub(ACCENTS_PATTERN, "", text)
8282

8383

84-
def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses=r"[।॥\n,;]+", skip_pattern=r"\+\+\+\(.+?\)\+\+\+"):
84+
def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses=r"[।॥\n,;]+", skip_pattern=r"\+\+\+\((.+?)\)\+\+\+"):
8585
"""Given text like
8686
ध्रु॒वो॑ऽसि ।
8787
ध्रु॒वो॒॑ऽहँ स॑जा॒तेषु॑ भूयास॒न्
@@ -97,6 +97,8 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
9797
if not any(x in text for x in [SVARITA, SANNATARA, "᳚", "᳛"]):
9898
# Avoid inserting udattas from the beginning on an invalid (already converted) input
9999
return text
100+
if any(x in text for x in [SVARITA_NEW, UDATTA]):
101+
return text
100102
text = regex.sub("[᳚᳛]", SVARITA, text)
101103
if scheme == None:
102104
from indic_transliteration import sanscript
@@ -106,7 +108,8 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
106108
SKIP_PATTERN = regex.compile(skip_pattern)
107109

108110
# Split the text into a list of syllables and other elements.
109-
letters = scheme.split_vyanjanas_and_svaras(text, skip_pattern=skip_pattern)
111+
skip_pattern_noncapture = regex.sub(r"(?<=^|[^\\])\(", "(?:", skip_pattern)
112+
letters = scheme.split_vyanjanas_and_svaras(text, skip_pattern=skip_pattern_noncapture)
110113
# Example output here - ['स्', "ओ", "+++(=tick)+++", 'ऽ', 'ग्', "न्", "इ॒", "म्", "ए॑", "व्", "अ"]
111114

112115
out_letters = list(letters)
@@ -135,6 +138,10 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
135138
# If a syllable has a svarita and the predecessor has a sannatara, remove both accents and add a svarita_new to the current syllable.
136139
# This rule (e.g., ध्रु॒वो॑ -> ध्रुवो᳕) is a specific substitution that takes precedence.
137140
for index, letter in enumerate(out_letters):
141+
# Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
142+
if SKIP_PATTERN.fullmatch(letter):
143+
continue
144+
138145
# If a syllable has a svarita...
139146
if SVARITA in letter and not SANNATARA in letter:
140147
# ...and the predecessor has a sannatara...
@@ -145,6 +152,10 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
145152

146153
for index, letter in enumerate(out_letters):
147154
is_kampa = SVARITA in letter and SANNATARA in letter # Rule 1
155+
156+
# Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
157+
if SKIP_PATTERN.fullmatch(letter):
158+
continue
148159

149160
# If a syllable has both sannatara and svarita signs (like वो॒॑), replace its svarita with udAtta, and temporarily keep the sannatara in itself.
150161
if is_kampa:
@@ -156,6 +167,10 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
156167

157168
# If a syllable has svarita, mark all preceding syllables with udAtta until a sannatara or svarita_new accent or a pause is reached; at which point remove any preceding sannatara.
158169
for index, letter in enumerate(out_letters):
170+
# Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
171+
if SKIP_PATTERN.fullmatch(letter):
172+
continue
173+
159174
# --- Backward "painting" from a Svarita ---
160175
if not SVARITA in letter:
161176
continue
@@ -179,6 +194,9 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
179194

180195
# If a syllable has sannatara, mark all succeeding syllables with udAtta until a svarita is reached or a pause is reached. Remove the triggering sannatara. After this is done for all syllables, there should be no sannatara left.
181196
for index, letter in enumerate(out_letters):
197+
# Deal with accented text like दु॒श्चरि॑तं॒ in SKIP_PATTERN
198+
if SKIP_PATTERN.fullmatch(letter):
199+
continue
182200

183201
# --- Forward "painting" from a Sannatara ---
184202
# This applies to simple sannatara only. Kampa's effect is handled above.
@@ -201,6 +219,13 @@ def to_US_accents(text, scheme=None, UDATTA = "᳓", SVARITA_NEW = "᳙", pauses
201219
curr_fwd_index = scheme.get_adjacent_syllable_index(curr_fwd_index, out_letters, +1,
202220
pauses_pattern=PAUSES_PATTERN)
203221

222+
for index, letter in enumerate(out_letters):
223+
match = SKIP_PATTERN.match(letter)
224+
if match:
225+
replacement = to_US_accents(text=match.group(1), scheme=scheme, UDATTA=UDATTA, SVARITA_NEW=SVARITA_NEW, pauses=pauses, skip_pattern=skip_pattern)
226+
out_letters[index] = letter.replace(match.group(1), replacement)
227+
228+
204229
text = scheme.join_strings(out_letters)
205230
text = text.replace(SVARITA, "").replace(SANNATARA, "")
206231
return text

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
# Versions should comply with PEP440. For a discussion on single-sourcing
4545
# the version across setup.py and the project code, see
4646
# https://packaging.python.org/en/latest/single_source_version.html
47-
version='2.3.72',
47+
version='2.3.73',
4848

4949

5050
description='Transliteration tools to convert text in one indic script encoding to another',

0 commit comments

Comments
 (0)