Skip to content

Commit e40a317

Browse files
committed
sd
1 parent ae9f4c7 commit e40a317

File tree

5 files changed

+182
-9
lines changed

5 files changed

+182
-9
lines changed

indic_transliteration/sanscript/schemes/brahmic/__init__.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,17 @@ def do_vyanjana_svara_join(self, vyanjanaanta, svaraadi):
3636
else:
3737
raise ValueError(svaraadi + " is not svaraadi.")
3838

39-
def split_vyanjanas_and_svaras(self, text):
39+
def split_vyanjanas_and_svaras(self, text, skip_pattern=r"\+\+\+\(.+?\)\+\+\+"):
40+
if skip_pattern is not None:
41+
segments = regex.split(rf"({skip_pattern})", text)
42+
if len(segments) > 1:
43+
letters = []
44+
for segment in segments:
45+
if regex.match(skip_pattern, segment):
46+
letters.append(segment)
47+
else:
48+
letters.extend(self.split_vyanjanas_and_svaras(text=segment, skip_pattern=None))
49+
return letters
4050
def _yogavaaha_accent_match(letter):
4151
return letter in self["yogavaahas"].values() or letter in self.get("accents", {}).values() or regex.match(self.YOGAVAAHAS, letter) or regex.match(self.ACCENTS, letter) is not None or letter in self.get("candra", {}).values()
4252

@@ -68,6 +78,20 @@ def _yogavaaha_accent_match(letter):
6878
out_letters.append(letter)
6979
return out_letters
7080

81+
# Helper to find the index of the next or previous valid syllable, skipping non-syllables.
82+
def get_adjacent_syllable_index(self, start_index, letters, direction, pauses_pattern=None):
    """Return the index of the nearest vowel-initial element next to start_index.

    The scan starts one step away from ``start_index`` and keeps moving in
    ``direction``, stepping over elements that are neither pauses nor
    vowel-initial (consonants, avagraha, spaces, skipped spans, ...).

    :param start_index: Position to scan away from; this element itself is not inspected.
    :param letters: List of strings, e.g. the output of split_vyanjanas_and_svaras.
    :param direction: +1 to scan forwards, -1 to scan backwards.
    :param pauses_pattern: Regex (string or compiled pattern). An element that
      fully matches it ends the scan. None means no element is treated as a pause.
    :return: Index of the first vowel-initial element found, or None if a pause
      or either end of ``letters`` is reached first.
    """
    if isinstance(pauses_pattern, str):
        pauses_pattern = regex.compile(pauses_pattern)
    # Compare against the vowel glyphs of this script (the dict values), as
    # join_strings does; the dict keys need not be in this script.
    vowels = tuple(self["vowels"].values())
    current_index = start_index + direction
    while 0 <= current_index < len(letters):
        letter = letters[current_index]
        if pauses_pattern is not None and pauses_pattern.fullmatch(letter):
            return None
        # The truthiness check guards letter[0] against empty elements.
        if letter and letter[0] in vowels:
            return current_index
        current_index += direction
    return None
93+
94+
7195
def get_consonant_letters(self, text):
7296
letters = self.split_vyanjanas_and_svaras(text)
7397
letters = [letter.replace(self["virama"]["्"], "") for letter in letters if letter.replace(self["virama"]["्"], "") in self["consonants"].values()]
@@ -97,15 +121,18 @@ def sandhi_sanskrit(self, str1, str2):
97121
return result
98122

99123

100-
def join_strings(self, strings):
124+
def join_strings(self, strings, do_sandhi=False):
    """Join a list of letters (as produced by split_vyanjanas_and_svaras) into text.

    A vowel-initial element that follows a virama replaces that virama with
    the vowel's dependent mark (or with nothing when vowel_to_mark_map has no
    entry for it); whatever follows the vowel in the element (accents,
    yogavaahas) is kept.

    :param strings: Iterable of strings to join, in order.
    :param do_sandhi: If True, elements that are not merged into a preceding
      virama are appended via sandhi_sanskrit; otherwise they are concatenated as is.
    :return: The joined text.
    """
    virama = self["virama"]["्"]
    vowels = self["vowels"].values()
    out_text = ""
    for letter in strings:
        if not letter:
            # Nothing to append; also keeps letter[0] below from raising IndexError.
            continue
        if letter[0] in vowels and out_text.endswith(virama):
            # Drop the trailing virama and attach the vowel as a dependent mark.
            out_text = out_text[:-1] + self.vowel_to_mark_map.get(letter[0], "") + letter[1:]
        elif do_sandhi:
            out_text = self.sandhi_sanskrit(out_text, letter)
        else:
            out_text += letter
    return out_text
110137

111138
def get_numerals(self):

indic_transliteration/sanscript/schemes/brahmic/accent.py

Lines changed: 142 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,29 @@
11
import regex
22

3+
ACCENTS_PATTERN = "[\u1CD0-\u1CE8\u1CF9\u1CFA\uA8E0-\uA8F1\u0951-\u0954\u0957]" # included ॗ , which is used as svara for weber's shatapatha
34

4-
ACCENTS_PATTERN = "[\u1CD0-\u1CE8\u1CF9\u1CFA\uA8E0-\uA8F1\u0951-\u0954\u0957]" # included ॗ , which is used as svara for weber's shatapatha
55

6-
def add_accent_to_previous_syllable(scheme, text, old_accent, new_accent=None, drop_at_first_syllable=False, retain_old_accent=False):
6+
def add_accent_to_previous_syllable(scheme, text, old_accent, new_accent=None, drop_at_first_syllable=False,
7+
retain_old_accent=False):
8+
"""
9+
modify text by moving old_accent from its current position to the preceding syllable's vowel or yogavaaha (a semi-vowel or special character that behaves like a vowel) in the form of new_accent. See test cases for example use.
10+
11+
:param scheme:
12+
:param text:
13+
:param old_accent:
14+
:param new_accent:
15+
:param drop_at_first_syllable: Should text like "ॗसैषा᳘" be produced?
16+
:param retain_old_accent:
17+
:return:
18+
"""
719
if new_accent is None:
820
new_accent = old_accent
921
letters = scheme.split_vyanjanas_and_svaras(text)
1022
out_letters = []
1123
vowels = list(scheme["vowels"].values())
1224
vowels_yogavaahas = vowels + list(scheme["yogavaahas"].values())
1325
accent_carryover = ""
26+
1427
for index, letter in enumerate(letters):
1528
if letter.endswith(old_accent):
1629
vowel_position = -1
@@ -22,7 +35,7 @@ def add_accent_to_previous_syllable(scheme, text, old_accent, new_accent=None, d
2235
if vowel_position == -1:
2336
if not drop_at_first_syllable:
2437
accent_carryover += new_accent
25-
else:
38+
else:
2639
out_letters[vowel_position] += new_accent
2740
if not retain_old_accent:
2841
out_letters.append(letter[:-1])
@@ -63,5 +76,130 @@ def set_diirgha_svaritas(scheme, text, accent="᳚"):
6376
text = regex.sub(f"(?<=[{vowel_string}]+)॑", accent, text)
6477
return text
6578

79+
6680
def strip_accents(text):
    """Return text with every character matched by ACCENTS_PATTERN removed."""
    without_accents = regex.sub(ACCENTS_PATTERN, "", text)
    return without_accents
82+
83+
84+
def to_US_accents(text, scheme, UDATTA="꣡", SVARITA_NEW="᳕", pauses=r"[।॥\n,;]+", skip_pattern=r"\+\+\+\(.+?\)\+\+\+"):
    """Rewrite sannatara/svarita accent notation so that udAtta syllables are marked explicitly.

    Given text like
    ध्रु॒वो॑ऽसि ।
    ध्रु॒वो॒॑ऽहँ स॑जा॒तेषु॑ भूयास॒न्
    धीर॒श् चेत्ता॑ वसु॒वित्।
    produce:
    ध्रुवो᳕ऽसि ।
    ध्रुवो᳕ऽहँ꣡ सजाते꣡षु भूयासन्
    धी꣡रश् चे꣡त्ता वसुवि꣡त्।

    :param text: Accented text in the script of ``scheme``.
    :param scheme: Scheme object providing split_vyanjanas_and_svaras,
      get_adjacent_syllable_index and join_strings.
    :param UDATTA: Mark appended to syllables identified as udAtta.
    :param SVARITA_NEW: Mark that replaces a svarita whose preceding syllable
      carries a sannatara.
    :param pauses: Regex; an element fully matching it stops accent propagation.
    :param skip_pattern: Regex forwarded to split_vyanjanas_and_svaras, which
      keeps each matching span as a single element.
    :return: The re-accented text.
    """
    # symbol definitions
    SANNATARA = "॒"
    SVARITA = "॑"

    PAUSES_PATTERN = regex.compile(pauses)

    # Split the text into a list of syllables and other elements.
    # Example output here - ['स्', "ओ", "+++(=tick)+++", 'ऽ', 'ग्', "न्", "इ॒", "म्", "ए॑", "व्", "अ"]
    letters = scheme.split_vyanjanas_and_svaras(text, skip_pattern=skip_pattern)
    out_letters = list(letters)

    # Mark every syllable following a pause (or the beginning of the text) as
    # udAtta, until a sannatara or svarita is met.
    mark_udAtta = False
    for index, letter in enumerate(out_letters):
        if PAUSES_PATTERN.fullmatch(letter) or index == 0:
            mark_udAtta = True
        if mark_udAtta:
            # Scan forwards (starting with the current element itself) and mark
            # succeeding syllables with udAtta.
            curr_fwd_index = scheme.get_adjacent_syllable_index(
                index - 1, out_letters, +1, pauses_pattern=PAUSES_PATTERN)
            while mark_udAtta and curr_fwd_index is not None:
                syllable_to_check = out_letters[curr_fwd_index]
                # Stop for good once an accented syllable is reached.
                if any(x in syllable_to_check for x in [SVARITA_NEW, SVARITA, SANNATARA]):
                    mark_udAtta = False
                    break
                # Add udAtta if not already accented.
                if UDATTA not in out_letters[curr_fwd_index]:
                    out_letters[curr_fwd_index] += UDATTA
                curr_fwd_index = scheme.get_adjacent_syllable_index(
                    curr_fwd_index, out_letters, +1, pauses_pattern=PAUSES_PATTERN)

    # --- PASS 1: Handle dependent svarita (Rule 2) ---
    # If a syllable has a svarita and its predecessor has a sannatara, remove
    # both accents and add a svarita_new to the current syllable.
    # This rule (e.g., ध्रु॒वो॑ -> ध्रुवो᳕) is a specific substitution that takes precedence.
    for index, letter in enumerate(out_letters):
        if SVARITA in letter:
            prev_index = scheme.get_adjacent_syllable_index(
                index, out_letters, -1, pauses_pattern=PAUSES_PATTERN)
            if prev_index is not None and SANNATARA in out_letters[prev_index]:
                out_letters[prev_index] = out_letters[prev_index].replace(SANNATARA, "")
                out_letters[index] = letter.replace(SVARITA, "") + SVARITA_NEW

    # --- Kampa (Rule 1) ---
    # If a syllable still has both sannatara and svarita signs (like वो॒॑), replace
    # its svarita with udAtta, remove the predecessor's sannatara, and temporarily
    # keep the sannatara on the syllable itself (the forward pass below removes it).
    for index, letter in enumerate(out_letters):
        is_kampa = SVARITA in letter and SANNATARA in letter
        if is_kampa:
            out_letters[index] = letter + UDATTA
            # pauses_pattern must be passed here just as in the other passes.
            prev_index = scheme.get_adjacent_syllable_index(
                index, out_letters, -1, pauses_pattern=PAUSES_PATTERN)
            if prev_index is not None and SANNATARA in out_letters[prev_index]:
                out_letters[prev_index] = out_letters[prev_index].replace(SANNATARA, "")
            out_letters[index] = out_letters[index].replace(SVARITA, "")

    # --- Backward "painting" from a svarita ---
    # If a syllable has a svarita, mark all preceding syllables with udAtta until
    # a sannatara, svarita, svarita_new or pause is reached; a sannatara found at
    # that point is removed. The triggering svarita is removed as well, so no
    # svarita should be left once this loop is done.
    for index, letter in enumerate(out_letters):
        if SVARITA not in letter:
            continue
        out_letters[index] = letter.replace(SVARITA, "")

        curr_back_index = scheme.get_adjacent_syllable_index(
            index, out_letters, -1, pauses_pattern=PAUSES_PATTERN)
        while curr_back_index is not None:
            syllable_to_check = out_letters[curr_back_index]
            if any(x in syllable_to_check for x in [SVARITA, SVARITA_NEW, SANNATARA]):
                # If the barrier carries a sannatara, remove it; stop either way.
                out_letters[curr_back_index] = out_letters[curr_back_index].replace(SANNATARA, "")
                break

            # Add udAtta if not already accented.
            if UDATTA not in out_letters[curr_back_index]:
                out_letters[curr_back_index] += UDATTA
            curr_back_index = scheme.get_adjacent_syllable_index(
                curr_back_index, out_letters, -1, pauses_pattern=PAUSES_PATTERN)

    # --- Forward "painting" from a sannatara ---
    # If a syllable has a sannatara, mark all succeeding syllables with udAtta
    # until an accented syllable or a pause is reached. The triggering sannatara
    # is removed, so no sannatara should be left once this loop is done.
    for index, letter in enumerate(out_letters):
        if SANNATARA in letter:
            out_letters[index] = letter.replace(SANNATARA, "")

            curr_fwd_index = scheme.get_adjacent_syllable_index(
                index, out_letters, +1, pauses_pattern=PAUSES_PATTERN)
            while curr_fwd_index is not None:
                syllable_to_check = out_letters[curr_fwd_index]
                if SVARITA in syllable_to_check:
                    out_letters[curr_fwd_index] = out_letters[curr_fwd_index].replace(SVARITA, "")
                    break
                # Stop at any other accented syllable without touching it.
                if any(x in syllable_to_check for x in [SVARITA_NEW, SANNATARA]):
                    break
                # Add udAtta if not already accented.
                if UDATTA not in out_letters[curr_fwd_index]:
                    out_letters[curr_fwd_index] += UDATTA
                curr_fwd_index = scheme.get_adjacent_syllable_index(
                    curr_fwd_index, out_letters, +1, pauses_pattern=PAUSES_PATTERN)

    return scheme.join_strings(out_letters)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
# Versions should comply with PEP440. For a discussion on single-sourcing
4545
# the version across setup.py and the project code, see
4646
# https://packaging.python.org/en/latest/single_source_version.html
47-
version='2.3.69',
47+
version='2.3.70',
4848

4949

5050
description='Transliteration tools to convert text in one indic script encoding to another',

tests/sanscript/accent_test.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,8 @@ def test_move_accent_to_previous_syllable():
99

1010

1111
def test_set_diirgha_svaritas():
    """A svarita on a long vowel is rewritten as the diirgha-svarita sign; others are kept."""
    devanagari = sanscript.SCHEMES[sanscript.DEVANAGARI]
    source_text = "त॑स्माद्वा॑ अप॑ उ॑पस्पृशति॥ सो᳕ऽग्नि॑मेवा᳕भी॑क्षमाणः।"
    expected = "त॑स्माद्वा᳚ अप॑ उ॑पस्पृशति॥ सो᳕ऽग्नि॑मेवा᳕भी᳚क्षमाणः।"
    converted = accent.set_diirgha_svaritas(scheme=devanagari, text=source_text)
    assert converted == expected
13+
14+
15+
def test_to_US_accents():
    """Multi-line accented input, including a +++(...)+++ span, converts as expected."""
    devanagari = sanscript.SCHEMES[sanscript.DEVANAGARI]
    source_text = """ध्रु॒वो॑ऽसि । \nध्रु॒वो॒॑ऽहँ स॑जा॒तेषु॑+++(=haya)+++ भूयास॒न् \nधीर॒श् चेत्ता॑ वसु॒वित्।"""
    expected = "ध्रुवो᳕ऽसि । \nध्रुवो᳕ऽहँ꣡ सजाते꣡षु+++(=haya)+++ भूयासन् \nधी꣡रश् चे꣡त्ता वसुवि꣡त्।"
    converted = accent.to_US_accents(scheme=devanagari, text=source_text)
    assert converted == expected

tests/sanscript/brahmic_test.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,12 @@ def test_do_vyanjana_svara_join():
4545

4646
def test_split_vyanjanas_and_svaras():
4747
devanagari = sanscript.SCHEMES[sanscript.DEVANAGARI]
48+
49+
assert devanagari.split_vyanjanas_and_svaras("सो+++(=tick)+++ऽग्नि᳘मेॗव", skip_pattern=r"\+\+\+\(.+?\)\+\+\+") == ['स्', "ओ", "+++(=tick)+++", 'ऽ', 'ग्', "न्", "इ᳘", "म्", "एॗ", "व्", "अ"]
50+
4851
assert devanagari.split_vyanjanas_and_svaras("नु॑") == ['न्', 'उ॑']
4952
assert devanagari.split_vyanjanas_and_svaras("सोऽग्नि᳘मेॗव") == ['स्', "ओ", 'ऽ', 'ग्', "न्", "इ᳘", "म्", "एॗ", "व्", "अ"]
53+
5054
assert devanagari.split_vyanjanas_and_svaras("मं") == ['म्', 'अं']
5155
assert devanagari.split_vyanjanas_and_svaras("ह्रीः") == ['ह्', 'र्', 'ईः']
5256
assert sanscript.SCHEMES[sanscript.KANNADA].split_vyanjanas_and_svaras("ಹ್ರೀಃ") == ["ಹ್", "ರ್", "ಈಃ"]

0 commit comments

Comments
 (0)