sd

vvasuki · vvasuki · commit e40a31742736 · 2025-08-05T17:29:16.000+05:30
diff --git a/indic_transliteration/sanscript/schemes/brahmic/__init__.py b/indic_transliteration/sanscript/schemes/brahmic/__init__.py
@@ -36,7 +36,17 @@ def do_vyanjana_svara_join(self, vyanjanaanta, svaraadi):
     else:
       raise ValueError(svaraadi + " is not svaraadi.")
 
-  def split_vyanjanas_and_svaras(self, text):
+  def split_vyanjanas_and_svaras(self, text, skip_pattern=r"\+\+\+\(.+?\)\+\+\+"):
+    if skip_pattern is not None:
+      segments = regex.split(rf"({skip_pattern})", text)
+      if len(segments) > 1:
+        letters = []
+        for segment in segments:
+          if regex.match(skip_pattern, segment):
+            letters.append(segment)
+          else:
+            letters.extend(self.split_vyanjanas_and_svaras(text=segment, skip_pattern=None))
+        return letters
     def _yogavaaha_accent_match(letter):
       return letter in self["yogavaahas"].values() or letter in self.get("accents", {}).values() or regex.match(self.YOGAVAAHAS, letter) or regex.match(self.ACCENTS, letter) is not None or letter in self.get("candra", {}).values()
     
@@ -68,6 +78,20 @@ def _yogavaaha_accent_match(letter):
         out_letters.append(letter)
     return out_letters
 
+  def get_adjacent_syllable_index(self, start_index, letters, direction, pauses_pattern=None):
+    """Return the index of the nearest letter after (direction=+1) or before (direction=-1) start_index whose first character is one of this scheme's vowels.
+    Return None if a letter fully matching pauses_pattern (a str, a compiled pattern, or None for no pause barrier) is met first, or if no such letter exists."""
+    if isinstance(pauses_pattern, str):
+      pauses_pattern = regex.compile(pauses_pattern)
+    current_index = start_index + direction
+    while 0 <= current_index < len(letters):
+      if pauses_pattern is not None and pauses_pattern.fullmatch(letters[current_index]):
+        return None
+      if letters[current_index][:1] in self["vowels"].values():  # [:1] tolerates ""; values() are this scheme's own vowels
+        return current_index
+      current_index += direction
+    return None
+
   def get_consonant_letters(self, text):
     letters = self.split_vyanjanas_and_svaras(text)
     letters = [letter.replace(self["virama"]["्"], "") for letter in letters if letter.replace(self["virama"]["्"], "") in self["consonants"].values()]
@@ -97,15 +121,18 @@ def sandhi_sanskrit(self, str1, str2):
       return result
 
 
-  def join_strings(self, strings):
+  def join_strings(self, strings, do_sandhi=False):  # do_sandhi=True joins pieces via self.sandhi_sanskrit; the default appends them unchanged
     out_text = ""
     for letter in strings:
       if letter[0] in self["vowels"].values() and out_text.endswith(self["virama"]["्"]):
         out_text = out_text[:-1] + self.vowel_to_mark_map.get(letter[0], "")
         if len(letter) > 1:
           out_text += letter[1:]
       else:
-        out_text = self.sandhi_sanskrit(out_text, letter)
+        if do_sandhi:
+          out_text = self.sandhi_sanskrit(out_text, letter)
+        else:
+          out_text += letter  # no sandhi requested: plain concatenation
     return out_text
 
   def get_numerals(self):
diff --git a/indic_transliteration/sanscript/schemes/brahmic/accent.py b/indic_transliteration/sanscript/schemes/brahmic/accent.py
@@ -1,16 +1,29 @@
 import regex
 
+ACCENTS_PATTERN = "[\u1CD0-\u1CE8\u1CF9\u1CFA\uA8E0-\uA8F1\u0951-\u0954\u0957]"  # included  ॗ , which is used as svara for weber's shatapatha
 
-ACCENTS_PATTERN = "[\u1CD0-\u1CE8\u1CF9\u1CFA\uA8E0-\uA8F1\u0951-\u0954\u0957]" # included  ॗ , which is used as svara for weber's shatapatha
 
-def add_accent_to_previous_syllable(scheme, text, old_accent, new_accent=None, drop_at_first_syllable=False, retain_old_accent=False):
+def add_accent_to_previous_syllable(scheme, text, old_accent, new_accent=None, drop_at_first_syllable=False,
+                                    retain_old_accent=False):
+  """
+  modify text by moving old_accent from its current position to the preceding syllable's vowel or yogavaaha (a semi-vowel or special character that behaves like a vowel) in the form of new_accent. See test cases for example use.
+
+  :param scheme: 
+  :param text: 
+  :param old_accent: 
+  :param new_accent: 
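+  :param drop_at_first_syllable: If True, an accent for which no earlier position is found (vowel_position stays -1) is dropped; if False it is added to accent_carryover (presumably yielding text like "ॗसैषा᳘" - confirm).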
+  :param retain_old_accent: 
+  :return: 
+  """
   if new_accent is None:
     new_accent = old_accent
   letters = scheme.split_vyanjanas_and_svaras(text)
   out_letters = []
   vowels = list(scheme["vowels"].values())
   vowels_yogavaahas = vowels + list(scheme["yogavaahas"].values())
   accent_carryover = ""
+
   for index, letter in enumerate(letters):
     if letter.endswith(old_accent):
       vowel_position = -1
@@ -22,7 +35,7 @@ def add_accent_to_previous_syllable(scheme, text, old_accent, new_accent=None, d
       if vowel_position == -1:
         if not drop_at_first_syllable:
           accent_carryover += new_accent
-      else: 
+      else:
         out_letters[vowel_position] += new_accent
       if not retain_old_accent:
         out_letters.append(letter[:-1])
@@ -63,5 +76,130 @@ def set_diirgha_svaritas(scheme, text, accent="᳚"):
   text = regex.sub(f"(?<=[{vowel_string}]+)॑", accent, text)
   return text
 
+
 def strip_accents(text):
-  return regex.sub(ACCENTS_PATTERN, "", text)
+  return regex.sub(ACCENTS_PATTERN, "", text)  # delete every character matched by ACCENTS_PATTERN
+
+
+def to_US_accents(text, scheme, UDATTA = "꣡", SVARITA_NEW = "᳕", pauses=r"[।॥\n,;]+", skip_pattern=r"\+\+\+\(.+?\)\+\+\+"):
+  """Rewrite sannatara and svarita accent marks in text as explicit UDATTA / SVARITA_NEW marks.
+
+  Given text like  
+  ध्रु॒वो॑ऽसि ।  
+  ध्रु॒वो॒॑ऽहँ स॑जा॒तेषु॑ भूयास॒न्  
+  धीर॒श् चेत्ता॑ वसु॒वित्। 
+  produce:
+  ध्रुवो᳕ऽसि ।  
+  ध्रुवो᳕ऽहँ꣡ सजाते꣡षु भूयासन्  
+  धी꣡रश् चे꣡त्ता वसुवि꣡त्। 
+
+  :param text: accented text to convert.
+  :param scheme: scheme object; its split_vyanjanas_and_svaras, get_adjacent_syllable_index and join_strings are used.
+  :param UDATTA: mark appended to every syllable that is determined to be udAtta.
+  :param SVARITA_NEW: mark that replaces a svarita whose preceding syllable carries a sannatara.
+  :param pauses: regex; a letter that fully matches it stops accent propagation in either direction.
+  :param skip_pattern: regex forwarded to scheme.split_vyanjanas_and_svaras; matching spans are kept as single letters.
+  :return: the letters joined back into text, with the accent marks rewritten.
+  """
+  SANNATARA = "॒"
+  SVARITA = "॑"
+  PAUSES_PATTERN = regex.compile(pauses)
+  # Split into letters, e.g. ['स्', "ओ", "+++(=tick)+++", 'ऽ', 'ग्', "न्", "इ॒", "म्", "ए॑", "व्", "अ"]; skipped spans stay whole.
+  out_letters = list(scheme.split_vyanjanas_and_svaras(text, skip_pattern=skip_pattern))
+  mark_udAtta = False
+  # mark any syllable starting from a pause (or the beginning of out_text) as udAtta, until a sannatara or svarita
+  for index, letter in enumerate(out_letters):
+    if PAUSES_PATTERN.fullmatch(letter) or index == 0:
+      mark_udAtta = True
+    if mark_udAtta:
+      # Scan forwards (starting at the current letter itself) and mark syllables with Udatta.
+      curr_fwd_index = scheme.get_adjacent_syllable_index(index-1, out_letters, +1, pauses_pattern=PAUSES_PATTERN)
+      while mark_udAtta and curr_fwd_index is not None:
+        syllable_to_check = out_letters[curr_fwd_index]
+        # Stop for good once an accented syllable (svarita or sannatara) is reached.
+        if any(x in syllable_to_check for x in [SVARITA_NEW, SVARITA, SANNATARA]):
+          mark_udAtta = False
+          break
+        # Add Udatta if not already accented.
+        if UDATTA not in out_letters[curr_fwd_index]:
+          out_letters[curr_fwd_index] += UDATTA
+        curr_fwd_index = scheme.get_adjacent_syllable_index(curr_fwd_index, out_letters, +1,
+                                                            pauses_pattern=PAUSES_PATTERN)
+
+  # --- PASS 1: Handle dependent Svarita (Rule 2) ---
+  # If a syllable has a svarita and the predecessor has a sannatara, remove both accents and add a svarita_new to the current syllable.
+  # This rule (e.g., ध्रु॒वो॑ -> ध्रुवो᳕) is a specific substitution that takes precedence.
+  for index, letter in enumerate(out_letters):
+    # If a syllable has a svarita...
+    if SVARITA in letter:
+      # ...and the predecessor has a sannatara...
+      prev_index = scheme.get_adjacent_syllable_index(index, out_letters, -1, pauses_pattern=PAUSES_PATTERN)
+      if prev_index is not None and SANNATARA in out_letters[prev_index]:
+        # ...remove both accents and add a svarita_new to the current syllable.
+        out_letters[prev_index] = out_letters[prev_index].replace(SANNATARA, "")
+        out_letters[index] = letter.replace(SVARITA, "") + SVARITA_NEW
+
+  for index, letter in enumerate(out_letters):
+    is_kampa = SVARITA in letter and SANNATARA in letter  # Rule 1
+
+    # If a syllable has both sannatara and svarita signs (like वो॒॑), replace its accents with udAtta, remove the predecessor's sannatara, and temporarily keep the sannatara in itself.
+    if is_kampa:
+      out_letters[index] = letter + UDATTA
+      # Kampa rule: also remove the predecessor's sannatara. The pause pattern must be passed like in every other lookup.
+      prev_index = scheme.get_adjacent_syllable_index(index, out_letters, -1, pauses_pattern=PAUSES_PATTERN)
+      if prev_index is not None and SANNATARA in out_letters[prev_index]:
+        out_letters[prev_index] = out_letters[prev_index].replace(SANNATARA, "")
+        out_letters[index] = out_letters[index].replace(SVARITA, "")
+
+  # If a syllable has svarita, mark all preceding syllables until a sannatara or svarita_new accent or a pause is reached with udAtta; at which point remove any preceding sannatara. Remove the triggering svarita. After this is done for all syllables, there should be no svarita left.
+  for index, letter in enumerate(out_letters):
+    # --- Backward "painting" from a Svarita ---
+    if SVARITA not in letter:
+      continue
+    # Remove the triggering svarita from the syllable.
+    # (A kampa syllable already received its Udatta in the loop above.)
+    out_letters[index] = letter.replace(SVARITA, "")
+
+    # Scan backwards and mark preceding syllables with Udatta.
+    curr_back_index = scheme.get_adjacent_syllable_index(index, out_letters, -1, pauses_pattern=PAUSES_PATTERN)
+    while curr_back_index is not None:
+      syllable_to_check = out_letters[curr_back_index]
+      if any(x in syllable_to_check for x in [SVARITA, SVARITA_NEW, SANNATARA]):
+        # If the barrier is a sannatara, remove it and stop.
+        out_letters[curr_back_index] = out_letters[curr_back_index].replace(SANNATARA, "")
+        break
+
+      # Add Udatta if not already accented.
+      if UDATTA not in out_letters[curr_back_index]:
+        out_letters[curr_back_index] += UDATTA
+      curr_back_index = scheme.get_adjacent_syllable_index(curr_back_index, out_letters, -1,
+                                                           pauses_pattern=PAUSES_PATTERN)
+
+
+  # If a syllable has sannatara, mark all succeeding syllables with udAtta until a svarita is reached or a pause is reached. Remove the triggering sannatara. After this is done for all syllables, there should be no sannatara left.
+  for index, letter in enumerate(out_letters):
+
+    # --- Forward "painting" from a Sannatara ---
+    # This applies to simple sannatara only. Kampa's effect is handled above.
+    if SANNATARA in letter:
+      # Remove the triggering sannatara.
+      out_letters[index] = letter.replace(SANNATARA, "")
+
+      # Scan forwards and mark succeeding syllables with Udatta.
+      curr_fwd_index = scheme.get_adjacent_syllable_index(index, out_letters, +1, pauses_pattern=PAUSES_PATTERN)
+      while curr_fwd_index is not None:
+        syllable_to_check = out_letters[curr_fwd_index]
+        if SVARITA in syllable_to_check:
+          out_letters[curr_fwd_index] = out_letters[curr_fwd_index].replace(SVARITA, "")
+          break
+        # Stop at any other accented syllable (a plain svarita was already handled just above).
+        if any(x in syllable_to_check for x in [SVARITA_NEW, SANNATARA]):
+          break
+        # Add Udatta if not already accented.
+        if UDATTA not in out_letters[curr_fwd_index]:
+          out_letters[curr_fwd_index] += UDATTA
+        curr_fwd_index = scheme.get_adjacent_syllable_index(curr_fwd_index, out_letters, +1,
+                                                            pauses_pattern=PAUSES_PATTERN)
+
+  text = scheme.join_strings(out_letters)
+  return text
diff --git a/setup.py b/setup.py
@@ -44,7 +44,7 @@
   # Versions should comply with PEP440.  For a discussion on single-sourcing
   # the version across setup.py and the project code, see
   # https://packaging.python.org/en/latest/single_source_version.html
-  version='2.3.69',
+  version='2.3.70',
 
 
   description='Transliteration tools to convert text in one indic script encoding to another',
diff --git a/tests/sanscript/accent_test.py b/tests/sanscript/accent_test.py
@@ -9,4 +9,8 @@ def test_move_accent_to_previous_syllable():
 
 
 def test_set_diirgha_svaritas():
-  assert accent.set_diirgha_svaritas(scheme=sanscript.SCHEMES[sanscript.DEVANAGARI], text="त॑स्माद्वा॑ अप॑ उ॑पस्पृशति॥ सो᳕ऽग्नि॑मेवा᳕भी॑क्षमाणः।") == "त॑स्माद्वा᳚ अप॑ उ॑पस्पृशति॥ सो᳕ऽग्नि॑मेवा᳕भी᳚क्षमाणः।"
+  assert accent.set_diirgha_svaritas(scheme=sanscript.SCHEMES[sanscript.DEVANAGARI], text="त॑स्माद्वा॑ अप॑ उ॑पस्पृशति॥ सो᳕ऽग्नि॑मेवा᳕भी॑क्षमाणः।") == "त॑स्माद्वा᳚ अप॑ उ॑पस्पृशति॥ सो᳕ऽग्नि॑मेवा᳕भी᳚क्षमाणः।"  # only the ॑ on वा and भी is expected to become ᳚
+  
+  
+def test_to_US_accents():  # input covers sannatara+svarita pairs, a syllable carrying both marks, a +++(...)+++ skip span, and । / newline pauses
+  assert accent.to_US_accents(scheme=sanscript.SCHEMES[sanscript.DEVANAGARI], text="""ध्रु॒वो॑ऽसि ।  \nध्रु॒वो॒॑ऽहँ स॑जा॒तेषु॑+++(=haya)+++ भूयास॒न्  \nधीर॒श् चेत्ता॑ वसु॒वित्।""") == "ध्रुवो᳕ऽसि ।  \nध्रुवो᳕ऽहँ꣡ सजाते꣡षु+++(=haya)+++ भूयासन्  \nधी꣡रश् चे꣡त्ता वसुवि꣡त्।"
diff --git a/tests/sanscript/brahmic_test.py b/tests/sanscript/brahmic_test.py
@@ -45,8 +45,12 @@ def test_do_vyanjana_svara_join():
 
 def test_split_vyanjanas_and_svaras():
   devanagari = sanscript.SCHEMES[sanscript.DEVANAGARI]
+
+  assert devanagari.split_vyanjanas_and_svaras("सो+++(=tick)+++ऽग्नि᳘मेॗव", skip_pattern=r"\+\+\+\(.+?\)\+\+\+") == ['स्', "ओ", "+++(=tick)+++", 'ऽ', 'ग्', "न्", "इ᳘", "म्", "एॗ", "व्", "अ"]
+
   assert devanagari.split_vyanjanas_and_svaras("नु॑") == ['न्', 'उ॑']
   assert devanagari.split_vyanjanas_and_svaras("सोऽग्नि᳘मेॗव") == ['स्', "ओ", 'ऽ', 'ग्', "न्", "इ᳘", "म्", "एॗ", "व्", "अ"]
+
   assert devanagari.split_vyanjanas_and_svaras("मं") == ['म्', 'अं']
   assert devanagari.split_vyanjanas_and_svaras("ह्रीः") == ['ह्', 'र्', 'ईः']
   assert sanscript.SCHEMES[sanscript.KANNADA].split_vyanjanas_and_svaras("ಹ್ರೀಃ") == ["ಹ್", "ರ್", "ಈಃ"]